azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +13 -2
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +86 -50
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +124 -3
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +24 -15
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1622 -765
- azure/ai/evaluation/red_team/_red_team_result.py +43 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
- azure/ai/evaluation/red_team/_utils/constants.py +6 -12
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +9 -8
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_evaluate.py
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import contextlib
 import json
 import logging
 import os
@@ -27,10 +28,10 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
     BINARY_AGGREGATE_SUFFIX,
-    DEFAULT_OAI_EVAL_RUN_NAME
+    DEFAULT_OAI_EVAL_RUN_NAME,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
-from .._user_agent import
+from .._user_agent import UserAgentSingleton
 from ._batch_run import (
     EvalRunContext,
     CodeClient,
@@ -43,7 +44,8 @@ from ._utils import (
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
-    DataLoaderFactory,
+    DataLoaderFactory,
+    _log_metrics_and_instance_results_onedp,
 )
 from ._batch_run.batch_clients import BatchClient, BatchClientRun
 
@@ -51,8 +53,9 @@ from ._evaluate_aoai import (
     _begin_aoai_evaluation,
     _split_evaluators_and_grader_configs,
     _get_evaluation_run_results,
-    OAIEvalRunCreationInfo
+    OAIEvalRunCreationInfo,
 )
+
 LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
@@ -69,11 +72,13 @@ class __EvaluatorInfo(TypedDict):
     metrics: Dict[str, Any]
     run_summary: Dict[str, Any]
 
+
 class __ValidatedData(TypedDict):
-
+    """
     Simple dictionary that contains ALL pre-processed data and
     the resultant objects that are needed for downstream evaluation.
-
+    """
+
     evaluators: Dict[str, Callable]
     graders: Dict[str, AzureOpenAIGrader]
     input_data_df: pd.DataFrame
@@ -255,7 +260,9 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
         if len(parts) >= 3:
             evaluator_name = parts[1]
         else:
-            LOGGER.warning(
+            LOGGER.warning(
+                "Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col
+            )
             continue
         if evaluator_name:
             # Count the occurrences of each unique value (pass/fail)
@@ -721,13 +728,16 @@ def evaluate(
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
-    :keyword azure_ai_project:
-
+    :keyword azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :paramtype azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
     :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
         if ANY evaluator fails during their evaluation.
         Defaults to false, which means that evaluations will continue regardless of failures.
         If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
     :paramtype fail_on_evaluator_errors: bool
+    :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
+    :paramtype user_agent: Optional[str]
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult
 
@@ -739,29 +749,31 @@ def evaluate(
             :language: python
             :dedent: 8
             :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START evaluate_method]
            :end-before: [END evaluate_method]
            :language: python
            :dedent: 8
-            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
     try:
-
-
-
-
-
-
-
-
-
-
-
+        user_agent: Optional[str] = kwargs.get("user_agent")
+        with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
+            return _evaluate(
+                evaluation_name=evaluation_name,
+                target=target,
+                data=data,
+                evaluators_and_graders=evaluators,
+                evaluator_config=evaluator_config,
+                azure_ai_project=azure_ai_project,
+                output_path=output_path,
+                fail_on_evaluator_errors=fail_on_evaluator_errors,
+                **kwargs,
+            )
     except Exception as e:
         # Handle multiprocess bootstrap error
         bootstrap_error = (
@@ -832,7 +844,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
         _print_fail_flag_warning()
-
+
     # Turn inputted mess of data into a dataframe, apply targets if needed
     # split graders and evaluators, and verify that column mappings are sensible.
     validated_data = _preprocess_data(
@@ -845,7 +857,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         evaluation_name=evaluation_name,
         **kwargs,
     )
-
+
     # extract relevant info from validated data
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
@@ -863,29 +875,25 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     if need_oai_run:
         try:
             aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
-            eval_run_info_list = _begin_aoai_evaluation(
-                graders,
-                column_mapping,
-                input_data_df,
-                aoi_name
-            )
+            eval_run_info_list = _begin_aoai_evaluation(graders, column_mapping, input_data_df, aoi_name)
             need_get_oai_results = len(eval_run_info_list) > 0
         except EvaluationException as e:
             if need_local_run:
                 # If there are normal evaluators, don't stop execution and try to run
                 # those.
-                LOGGER.warning(
-
+                LOGGER.warning(
+                    "Remote Azure Open AI grader evaluations failed during run creation."
+                    + " Continuing with local evaluators."
+                )
                 LOGGER.warning(e)
             else:
                 raise e
-
+
     # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
     if need_local_run:
         try:
-            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
-                validated_data=validated_data,
-                fail_on_evaluator_errors=fail_on_evaluator_errors
+            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                validated_data=validated_data, fail_on_evaluator_errors=fail_on_evaluator_errors
             )
             results_df = eval_result_df
             metrics = eval_metrics
@@ -903,7 +911,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     # Retrieve OAI eval run results if needed.
     if need_get_oai_results:
         try:
-            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
             # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
 
             # Combine results if both evaluators and graders are present
@@ -955,22 +963,17 @@ def _preprocess_data(
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
     **kwargs,
-
+) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
 
     input_data_df = _validate_and_load_data(
-        target,
-        data,
-        evaluators_and_graders,
-        output_path,
-        azure_ai_project,
-        evaluation_name
+        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name
     )
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
-
+
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
     column_mapping = _process_column_mappings(
         {
@@ -996,7 +999,7 @@
         batch_run_client = RunSubmitterClient()
         batch_run_data = input_data_df
     elif kwargs.pop("_use_pf_client", True):
-        batch_run_client = ProxyClient(user_agent=
+        batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
@@ -1127,14 +1130,15 @@ def _run_callable_evaluators(
 
     return eval_result_df, eval_metrics, per_evaluator_results
 
+
 def _map_names_to_builtins(
-
-
-
+    evaluators: Dict[str, Callable],
+    graders: Dict[str, AzureOpenAIGrader],
+) -> Dict[str, str]:
     """
     Construct a mapping from user-supplied evaluator names to which known, built-in
-    evaluator or grader they refer to. Custom
-
+    evaluator or grader they refer to. Custom evaluators are excluded from the mapping
+    as we only want to track built-in evaluators and graders.
 
     :param evaluators: The dictionary of evaluators.
     :type evaluators: Dict[str, Callable]
@@ -1142,9 +1146,10 @@ def _map_names_to_builtins(
     :type graders: Dict[str, AzureOpenAIGrader]
     :param evaluator_config: The configuration for evaluators.
     :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
-
+
     """
     from .._eval_mapping import EVAL_CLASS_MAP
+
     name_map = {}
 
     for name, evaluator in evaluators.items():
@@ -1156,14 +1161,15 @@ def _map_names_to_builtins(
                 found_eval = True
                 break
         if not found_eval:
-            #
-
-
-        for
+            # Skip custom evaluators - we only want to track built-in evaluators
+            pass
+
+    for name, grader in graders.items():
         name_map[name] = grader.id
 
     return name_map
 
+
 def _turn_error_logs_into_exception(log_path: str) -> None:
     """Produce an EvaluationException using the contents of the inputted
     file as the error message.
@@ -1178,4 +1184,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
         target=ErrorTarget.EVALUATE,
         category=ErrorCategory.FAILED_EXECUTION,
         blame=ErrorBlame.UNKNOWN,
-    )
+    )
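
For orientation, here is a minimal usage sketch of the evaluate() options documented in the docstring changes above: the project-endpoint string form of azure_ai_project and the new user_agent keyword. The data file name, endpoint, evaluator choice, and user-agent value are illustrative placeholders, not values taken from this diff.

# Hypothetical example -- file name, endpoint, and UA string are placeholders.
from azure.ai.evaluation import evaluate, F1ScoreEvaluator

result = evaluate(
    data="eval_data.jsonl",  # JSONL rows containing "response" and "ground_truth" fields
    evaluators={"f1_score": F1ScoreEvaluator()},
    # Per the updated docstring, azure_ai_project may now be the project endpoint URL
    # (https://{resource_name}.services.ai.azure.com/api/projects/{project_name})
    # instead of an AzureAIProject dict.
    azure_ai_project="https://my-resource.services.ai.azure.com/api/projects/my-project",
    output_path="./evaluation_results.json",  # optional local copy of the results
    fail_on_evaluator_errors=False,  # default: continue even if an evaluator fails
    user_agent="my-app/1.0",  # new keyword: appended to the default user-agent header
)
print(result["metrics"])

As the hunk above shows, evaluate() reads user_agent from kwargs and wraps the run in UserAgentSingleton().add_useragent_product(...), falling back to contextlib.nullcontext() when no value is supplied.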