azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py (1.8.0 → 1.10.0)

@@ -2,11 +2,14 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import contextlib
 import json
 import logging
 import os
 import re
-
+import tempfile
+import json
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
 
 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -27,10 +30,10 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
     BINARY_AGGREGATE_SUFFIX,
-    DEFAULT_OAI_EVAL_RUN_NAME
+    DEFAULT_OAI_EVAL_RUN_NAME,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
-from .._user_agent import
+from .._user_agent import UserAgentSingleton
 from ._batch_run import (
     EvalRunContext,
     CodeClient,
@@ -43,7 +46,8 @@ from ._utils import (
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
-    DataLoaderFactory,
+    DataLoaderFactory,
+    _log_metrics_and_instance_results_onedp,
 )
 from ._batch_run.batch_clients import BatchClient, BatchClientRun
 
@@ -51,8 +55,9 @@ from ._evaluate_aoai import (
     _begin_aoai_evaluation,
     _split_evaluators_and_grader_configs,
     _get_evaluation_run_results,
-    OAIEvalRunCreationInfo
+    OAIEvalRunCreationInfo,
 )
+
 LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
@@ -69,11 +74,13 @@ class __EvaluatorInfo(TypedDict):
     metrics: Dict[str, Any]
     run_summary: Dict[str, Any]
 
+
 class __ValidatedData(TypedDict):
-
+    """
     Simple dictionary that contains ALL pre-processed data and
     the resultant objects that are needed for downstream evaluation.
-
+    """
+
     evaluators: Dict[str, Callable]
     graders: Dict[str, AzureOpenAIGrader]
     input_data_df: pd.DataFrame
@@ -255,7 +262,9 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
         if len(parts) >= 3:
             evaluator_name = parts[1]
         else:
-            LOGGER.warning(
+            LOGGER.warning(
+                "Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col
+            )
             continue
         if evaluator_name:
             # Count the occurrences of each unique value (pass/fail)
@@ -604,6 +613,18 @@ def _apply_target_to_data(
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.USER_ERROR,
         )
+
+    # Log a warning if some rows failed
+    failed_lines = run_summary.get("failed_lines", 0)
+    completed_lines = run_summary["completed_lines"]
+    total_lines = failed_lines + completed_lines
+
+    if failed_lines > 0:
+        LOGGER.warning(
+            f"Target function completed {completed_lines} out of {total_lines} rows. "
+            f"{failed_lines} rows failed and will be filled with NaN values."
+        )
+
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -611,6 +632,13 @@ def _apply_target_to_data(
     # Sort output by line numbers
     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
     target_output.sort_index(inplace=True)
+
+    initial_data_with_line_numbers = initial_data.copy()
+    initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+    complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+    target_output = target_output.reindex(complete_index)
+
     target_output.reset_index(inplace=True, drop=False)
     # target_output contains only input columns, taken by function,
     # so we need to concatenate it to the input data frame.
@@ -619,8 +647,8 @@ def _apply_target_to_data(
     # Rename outputs columns to __outputs
     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
     target_output.rename(columns=rename_dict, inplace=True)
-    # Concatenate output to input
-    target_output = pd.concat([
+    # Concatenate output to input - now both dataframes have the same number of rows
+    target_output = pd.concat([initial_data, target_output], axis=1)
 
     return target_output, generated_columns, run
 
@@ -638,7 +666,7 @@ def _process_column_mappings(
 
     processed_config: Dict[str, Dict[str, str]] = {}
 
-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")
 
     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
@@ -721,13 +749,16 @@ def evaluate(
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
-    :keyword azure_ai_project:
-
+    :keyword azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :paramtype azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
     :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
         if ANY evaluator fails during their evaluation.
         Defaults to false, which means that evaluations will continue regardless of failures.
        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
     :paramtype fail_on_evaluator_errors: bool
+    :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
+    :paramtype user_agent: Optional[str]
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult
 
@@ -739,29 +770,31 @@ def evaluate(
             :language: python
             :dedent: 8
             :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START evaluate_method]
            :end-before: [END evaluate_method]
            :language: python
            :dedent: 8
-            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
     try:
-
-
-
-
-
-
-
-
-
-
-
+        user_agent: Optional[str] = kwargs.get("user_agent")
+        with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
+            return _evaluate(
+                evaluation_name=evaluation_name,
+                target=target,
+                data=data,
+                evaluators_and_graders=evaluators,
+                evaluator_config=evaluator_config,
+                azure_ai_project=azure_ai_project,
+                output_path=output_path,
+                fail_on_evaluator_errors=fail_on_evaluator_errors,
+                **kwargs,
+            )
     except Exception as e:
         # Handle multiprocess bootstrap error
         bootstrap_error = (
@@ -832,7 +865,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
         _print_fail_flag_warning()
-
+
     # Turn inputted mess of data into a dataframe, apply targets if needed
     # split graders and evaluators, and verify that column mappings are sensible.
     validated_data = _preprocess_data(
@@ -843,9 +876,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         output_path=output_path,
         azure_ai_project=azure_ai_project,
         evaluation_name=evaluation_name,
+        fail_on_evaluator_errors=fail_on_evaluator_errors,
         **kwargs,
     )
-
+
     # extract relevant info from validated data
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
@@ -863,29 +897,25 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     if need_oai_run:
         try:
             aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
-            eval_run_info_list = _begin_aoai_evaluation(
-                graders,
-                column_mapping,
-                input_data_df,
-                aoi_name
-            )
+            eval_run_info_list = _begin_aoai_evaluation(graders, column_mapping, input_data_df, aoi_name)
             need_get_oai_results = len(eval_run_info_list) > 0
         except EvaluationException as e:
             if need_local_run:
                 # If there are normal evaluators, don't stop execution and try to run
                 # those.
-                LOGGER.warning(
-
+                LOGGER.warning(
+                    "Remote Azure Open AI grader evaluations failed during run creation."
+                    + " Continuing with local evaluators."
+                )
                 LOGGER.warning(e)
             else:
                 raise e
-
+
     # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
     if need_local_run:
         try:
-            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
-                validated_data=validated_data,
-                fail_on_evaluator_errors=fail_on_evaluator_errors
+            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                validated_data=validated_data, fail_on_evaluator_errors=fail_on_evaluator_errors
             )
             results_df = eval_result_df
            metrics = eval_metrics
@@ -903,7 +933,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     # Retrieve OAI eval run results if needed.
     if need_get_oai_results:
         try:
-            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
             # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
 
     # Combine results if both evaluators and graders are present
@@ -954,23 +984,19 @@ def _preprocess_data(
     output_path: Optional[Union[str, os.PathLike]] = None,
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
-
+) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
 
     input_data_df = _validate_and_load_data(
-        target,
-        data,
-        evaluators_and_graders,
-        output_path,
-        azure_ai_project,
-        evaluation_name
+        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name
    )
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
-
+
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
     column_mapping = _process_column_mappings(
         {
@@ -992,15 +1018,49 @@ def _preprocess_data(
     batch_run_client: BatchClient
     batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
 
-
-
+    def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
+        """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
+        _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
+        _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
+
+        if _use_run_submitter_client is None and _use_pf_client is None:
+            # If both are unset, return default
+            return "run_submitter"
+
+        if _use_run_submitter_client and _use_pf_client:
+            raise EvaluationException(
+                message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        if _use_run_submitter_client == False and _use_pf_client == False:
+            return "code_client"
+
+        if _use_run_submitter_client:
+            return "run_submitter"
+        if _use_pf_client:
+            return "pf_client"
+
+        if _use_run_submitter_client is None and _use_pf_client == False:
+            return "run_submitter"
+        if _use_run_submitter_client == False and _use_pf_client is None:
+            return "pf_client"
+
+        assert False, "This should be impossible"
+
+    client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
+
+    if client_type == "run_submitter":
+        batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
         batch_run_data = input_data_df
-    elif
-        batch_run_client = ProxyClient(user_agent=
+    elif client_type == "pf_client":
+        batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
-
+    elif client_type == "code_client":
         batch_run_client = CodeClient()
         batch_run_data = input_data_df
 
@@ -1010,17 +1070,50 @@ def _preprocess_data(
             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )
 
-
-
-
-
-
-
-
-
-
-
-
+        # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
+        # This ensures that evaluators get all rows (including failed ones with NaN values)
+        if isinstance(batch_run_client, ProxyClient):
+            # Create a temporary JSONL file with the complete dataframe
+            temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
+            try:
+                for _, row in input_data_df.iterrows():
+                    row_dict = row.to_dict()
+                    temp_file.write(json.dumps(row_dict) + "\n")
+                temp_file.close()
+                batch_run_data = temp_file.name
+
+                # Update column mappings to use data references instead of run outputs
+                for evaluator_name, mapping in column_mapping.items():
+                    mapped_to_values = set(mapping.values())
+                    for col in target_generated_columns:
+                        # Use data reference instead of run output to ensure we get all rows
+                        target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                        # We will add our mapping only if customer did not map target output.
+                        if col not in mapping and target_reference not in mapped_to_values:
+                            column_mapping[evaluator_name][col] = target_reference
+
+                # Don't pass the target_run since we're now using the complete dataframe
+                target_run = None
+
+            except Exception as e:
+                # Clean up the temp file if something goes wrong
+                if os.path.exists(temp_file.name):
+                    os.unlink(temp_file.name)
+                raise e
+        else:
+            # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
+            batch_run_data = input_data_df
+
+            # Update column mappings for DataFrame clients
+            for evaluator_name, mapping in column_mapping.items():
+                mapped_to_values = set(mapping.values())
+                for col in target_generated_columns:
+                    target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                    # We will add our mapping only if customer did not map target output.
+                    if col not in mapping and target_reference not in mapped_to_values:
+                        column_mapping[evaluator_name][col] = target_reference
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
@@ -1059,30 +1152,50 @@ def _run_callable_evaluators(
     batch_run_data = validated_data["batch_run_data"]
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
-    with EvalRunContext(batch_run_client):
-        runs = {
-            evaluator_name: batch_run_client.run(
-                flow=evaluator,
-                data=batch_run_data,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
 
-
-
-
-
-
-
+    # Clean up temporary file after evaluation if it was created
+    temp_file_to_cleanup = None
+    if (
+        isinstance(batch_run_client, ProxyClient)
+        and isinstance(batch_run_data, str)
+        and batch_run_data.endswith(".jsonl")
+    ):
+        # Check if it's a temporary file (contains temp directory path)
+        if tempfile.gettempdir() in batch_run_data:
+            temp_file_to_cleanup = batch_run_data
+
+    try:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    data=batch_run_data,
+                    # Don't pass target_run when using complete dataframe
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
            }
-            for evaluator_name, run in runs.items()
-        }
 
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+    finally:
+        # Clean up temporary file if it was created
+        if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
+            try:
+                os.unlink(temp_file_to_cleanup)
+            except Exception as e:
+                LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
     # Concatenate all results
     evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
@@ -1127,10 +1240,11 @@ def _run_callable_evaluators(
 
     return eval_result_df, eval_metrics, per_evaluator_results
 
+
 def _map_names_to_builtins(
-
-
-
+    evaluators: Dict[str, Callable],
+    graders: Dict[str, AzureOpenAIGrader],
+) -> Dict[str, str]:
     """
     Construct a mapping from user-supplied evaluator names to which known, built-in
     evaluator or grader they refer to. Custom evaluators are excluded from the mapping
@@ -1142,9 +1256,10 @@ def _map_names_to_builtins(
     :type graders: Dict[str, AzureOpenAIGrader]
     :param evaluator_config: The configuration for evaluators.
     :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
-
+
     """
     from .._eval_mapping import EVAL_CLASS_MAP
+
     name_map = {}
 
     for name, evaluator in evaluators.items():
@@ -1158,12 +1273,13 @@ def _map_names_to_builtins(
         if not found_eval:
             # Skip custom evaluators - we only want to track built-in evaluators
            pass
-
-    for
+
+    for name, grader in graders.items():
         name_map[name] = grader.id
 
     return name_map
 
+
 def _turn_error_logs_into_exception(log_path: str) -> None:
     """Produce an EvaluationException using the contents of the inputted
     file as the error message.
@@ -1178,4 +1294,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
         target=ErrorTarget.EVALUATE,
         category=ErrorCategory.FAILED_EXECUTION,
         blame=ErrorBlame.UNKNOWN,
-    )
+    )
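The docstring changes above document two user-facing additions to evaluate(): azure_ai_project may now be either an AzureAIProject instance or a project endpoint URL string, and a user_agent keyword is appended to the default user agent on evaluation HTTP requests. Below is a minimal usage sketch based only on those docstring entries; the endpoint, deployment, project URL, and data file are placeholders, and the choice of RelevanceEvaluator is illustrative rather than taken from this diff.

# Illustrative sketch of the evaluate() keywords surfaced in this diff.
# All endpoint/deployment/path values are placeholders.
from azure.ai.evaluation import evaluate, RelevanceEvaluator

model_config = {
    "azure_endpoint": "https://<your-aoai-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

results = evaluate(
    data="data.jsonl",  # JSONL rows containing the columns the evaluators expect
    evaluators={"relevance": RelevanceEvaluator(model_config)},
    # New per this diff: azure_ai_project may be the project endpoint URL string
    azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    # New keyword per this diff: appended to the default user agent on HTTP requests
    user_agent="my-app/1.0",
    output_path="./evaluation_results.json",
)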