azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
|
@@ -2,37 +2,128 @@
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
import inspect
|
|
5
|
+
import contextlib
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import math
|
|
5
9
|
import os
|
|
6
10
|
import re
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
import
|
|
11
|
+
import tempfile
|
|
12
|
+
import json
|
|
13
|
+
import time
|
|
14
|
+
from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
|
|
15
|
+
|
|
16
|
+
from openai import OpenAI, AzureOpenAI
|
|
17
|
+
from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
|
|
18
|
+
from azure.ai.evaluation._legacy._adapters.entities import Run
|
|
10
19
|
import pandas as pd
|
|
11
|
-
from promptflow._sdk._constants import LINE_NUMBER
|
|
12
|
-
from promptflow.client import PFClient
|
|
13
20
|
|
|
21
|
+
from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
|
|
22
|
+
from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
|
|
23
|
+
from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
|
|
14
24
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
25
|
+
from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
|
|
15
26
|
|
|
16
27
|
from .._constants import (
|
|
17
28
|
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
|
|
29
|
+
EVALUATION_PASS_FAIL_MAPPING,
|
|
18
30
|
EvaluationMetrics,
|
|
31
|
+
DefaultOpenEncoding,
|
|
19
32
|
Prefixes,
|
|
20
33
|
_InternalEvaluationMetrics,
|
|
34
|
+
BINARY_AGGREGATE_SUFFIX,
|
|
35
|
+
DEFAULT_OAI_EVAL_RUN_NAME,
|
|
36
|
+
EVALUATION_EVENT_NAME,
|
|
37
|
+
_EvaluatorMetricMapping,
|
|
38
|
+
)
|
|
39
|
+
from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig
|
|
40
|
+
from .._user_agent import UserAgentSingleton
|
|
41
|
+
from ._batch_run import (
|
|
42
|
+
EvalRunContext,
|
|
43
|
+
CodeClient,
|
|
44
|
+
ProxyClient,
|
|
45
|
+
TargetRunContext,
|
|
46
|
+
RunSubmitterClient,
|
|
21
47
|
)
|
|
22
|
-
from .._model_configurations import AzureAIProject
|
|
23
|
-
from .._user_agent import USER_AGENT
|
|
24
|
-
from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
|
|
25
48
|
from ._utils import (
|
|
26
49
|
_apply_column_mapping,
|
|
27
50
|
_log_metrics_and_instance_results,
|
|
28
51
|
_trace_destination_from_project_scope,
|
|
29
52
|
_write_output,
|
|
53
|
+
DataLoaderFactory,
|
|
54
|
+
_log_metrics_and_instance_results_onedp,
|
|
55
|
+
)
|
|
56
|
+
from ._batch_run.batch_clients import BatchClient, BatchClientRun
|
|
57
|
+
|
|
58
|
+
from ._evaluate_aoai import (
|
|
59
|
+
_begin_aoai_evaluation,
|
|
60
|
+
_split_evaluators_and_grader_configs,
|
|
61
|
+
_get_evaluation_run_results,
|
|
62
|
+
OAIEvalRunCreationInfo,
|
|
30
63
|
)
|
|
31
64
|
|
|
65
|
+
LOGGER = logging.getLogger(__name__)
|
|
66
|
+
|
|
67
|
+
# For metrics (aggregates) whose metric names intentionally differ from their
|
|
68
|
+
# originating column name, usually because the aggregation of the original value
|
|
69
|
+
# means something sufficiently different.
|
|
70
|
+
# Note that content safety metrics are handled separately.
|
|
71
|
+
METRIC_COLUMN_NAME_REPLACEMENTS = {
|
|
72
|
+
"groundedness_pro_label": "groundedness_pro_passing_rate",
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class __EvaluatorInfo(TypedDict):
|
|
77
|
+
result: pd.DataFrame
|
|
78
|
+
metrics: Dict[str, Any]
|
|
79
|
+
run_summary: Dict[str, Any]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class __ValidatedData(TypedDict):
|
|
83
|
+
"""
|
|
84
|
+
Simple dictionary that contains ALL pre-processed data and
|
|
85
|
+
the resultant objects that are needed for downstream evaluation.
|
|
86
|
+
"""
|
|
87
|
+
|
|
88
|
+
evaluators: Dict[str, Callable]
|
|
89
|
+
graders: Dict[str, AzureOpenAIGrader]
|
|
90
|
+
input_data_df: pd.DataFrame
|
|
91
|
+
column_mapping: Dict[str, Dict[str, str]]
|
|
92
|
+
target_run: Optional[BatchClientRun]
|
|
93
|
+
batch_run_client: BatchClient
|
|
94
|
+
batch_run_data: Union[str, os.PathLike, pd.DataFrame]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
|
|
98
|
+
"""Identify and average various metrics that need to have the metric name be replaced,
|
|
99
|
+
instead of having the metric match the originating column name.
|
|
100
|
+
:param df: The dataframe of evaluation results.
|
|
101
|
+
:type df: ~pandas.DataFrame
|
|
102
|
+
:return: A tuple; the first element is a list of dataframe columns that were aggregated,
|
|
103
|
+
and the second element is a dictionary of resultant new metric column names and their values.
|
|
104
|
+
:rtype: Tuple[List[str], Dict[str, float]]
|
|
105
|
+
"""
|
|
106
|
+
renamed_cols = []
|
|
107
|
+
metric_columns = {}
|
|
108
|
+
for col in df.columns:
|
|
109
|
+
metric_prefix = col.split(".")[0]
|
|
110
|
+
metric_name = col.split(".")[1]
|
|
111
|
+
if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
|
|
112
|
+
renamed_cols.append(col)
|
|
113
|
+
new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
|
|
114
|
+
col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
|
|
115
|
+
try:
|
|
116
|
+
metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
|
|
117
|
+
except EvaluationException: # only exception that can be cause is all NaN values
|
|
118
|
+
msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
|
|
119
|
+
LOGGER.warning(msg)
|
|
120
|
+
|
|
121
|
+
return renamed_cols, metric_columns
|
|
122
|
+
|
|
32
123
|
|
|
33
124
|
# pylint: disable=line-too-long
|
|
34
125
|
def _aggregate_content_safety_metrics(
|
|
35
|
-
df: pd.DataFrame, evaluators: Dict[str,
|
|
126
|
+
df: pd.DataFrame, evaluators: Dict[str, Callable]
|
|
36
127
|
) -> Tuple[List[str], Dict[str, float]]:
|
|
37
128
|
"""Find and aggregate defect rates for content safety metrics. Returns both a list
|
|
38
129
|
of columns that were used to calculate defect rates and the defect rates themselves.
|
|
@@ -61,7 +152,6 @@ def _aggregate_content_safety_metrics(
|
|
|
61
152
|
module = inspect.getmodule(evaluators[evaluator_name])
|
|
62
153
|
if (
|
|
63
154
|
module
|
|
64
|
-
and module.__name__.startswith("azure.ai.evaluation.")
|
|
65
155
|
and metric_name.endswith("_score")
|
|
66
156
|
and metric_name.replace("_score", "") in content_safety_metrics
|
|
67
157
|
):
|
|
@@ -71,12 +161,16 @@ def _aggregate_content_safety_metrics(
|
|
|
71
161
|
defect_rates = {}
|
|
72
162
|
for col in content_safety_df.columns:
|
|
73
163
|
defect_rate_name = col.replace("_score", "_defect_rate")
|
|
74
|
-
col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
164
|
+
col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
|
|
165
|
+
try:
|
|
166
|
+
col_with_boolean_values = apply_transform_nan_safe(
|
|
167
|
+
col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
|
|
168
|
+
)
|
|
169
|
+
defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
|
|
170
|
+
except EvaluationException: # only exception that can be cause is all NaN values
|
|
171
|
+
msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
|
|
172
|
+
LOGGER.warning(msg)
|
|
173
|
+
|
|
80
174
|
return content_safety_cols, defect_rates
|
|
81
175
|
|
|
82
176
|
|
|
@@ -92,28 +186,152 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
|
|
|
92
186
|
"""
|
|
93
187
|
handled_metrics = [
|
|
94
188
|
EvaluationMetrics.PROTECTED_MATERIAL,
|
|
189
|
+
EvaluationMetrics.FICTIONAL_CHARACTERS,
|
|
190
|
+
EvaluationMetrics.ARTWORK,
|
|
191
|
+
EvaluationMetrics.LOGOS_AND_BRANDS,
|
|
95
192
|
_InternalEvaluationMetrics.ECI,
|
|
96
193
|
EvaluationMetrics.XPIA,
|
|
194
|
+
EvaluationMetrics.CODE_VULNERABILITY,
|
|
195
|
+
EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
|
|
97
196
|
]
|
|
98
197
|
label_cols = []
|
|
198
|
+
details_cols = []
|
|
99
199
|
for col in df.columns:
|
|
100
200
|
metric_name = col.split(".")[1]
|
|
101
201
|
if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
|
|
102
202
|
label_cols.append(col)
|
|
203
|
+
if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
|
|
204
|
+
details_cols = col
|
|
103
205
|
|
|
104
206
|
label_df = df[label_cols]
|
|
105
207
|
defect_rates = {}
|
|
106
208
|
for col in label_df.columns:
|
|
107
209
|
defect_rate_name = col.replace("_label", "_defect_rate")
|
|
108
|
-
col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
210
|
+
col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
|
|
211
|
+
try:
|
|
212
|
+
defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
|
|
213
|
+
except EvaluationException: # only exception that can be cause is all NaN values
|
|
214
|
+
msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
|
|
215
|
+
LOGGER.warning(msg)
|
|
216
|
+
|
|
217
|
+
if details_cols:
|
|
218
|
+
details_df = df[details_cols]
|
|
219
|
+
detail_defect_rates = {}
|
|
220
|
+
|
|
221
|
+
for key, value in details_df.items():
|
|
222
|
+
_process_rows(value, detail_defect_rates)
|
|
223
|
+
|
|
224
|
+
for key, value in detail_defect_rates.items():
|
|
225
|
+
col_with_boolean_values = pd.to_numeric(value, errors="coerce")
|
|
226
|
+
try:
|
|
227
|
+
defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
|
|
228
|
+
list_mean_nan_safe(col_with_boolean_values), 2
|
|
229
|
+
)
|
|
230
|
+
except EvaluationException: # only exception that can be cause is all NaN values
|
|
231
|
+
msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
|
|
232
|
+
LOGGER.warning(msg)
|
|
233
|
+
|
|
113
234
|
return label_cols, defect_rates
|
|
114
235
|
|
|
115
236
|
|
|
116
|
-
def
|
|
237
|
+
def _process_rows(row, detail_defect_rates):
|
|
238
|
+
for key, value in row.items():
|
|
239
|
+
if key not in detail_defect_rates:
|
|
240
|
+
detail_defect_rates[key] = []
|
|
241
|
+
detail_defect_rates[key].append(value)
|
|
242
|
+
return detail_defect_rates
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
|
|
246
|
+
"""
|
|
247
|
+
Aggregate binary output results (pass/fail) from evaluation dataframe.
|
|
248
|
+
|
|
249
|
+
For each evaluator, calculates the proportion of "pass" results.
|
|
250
|
+
|
|
251
|
+
:param df: The dataframe of evaluation results.
|
|
252
|
+
:type df: ~pandas.DataFrame
|
|
253
|
+
:return: A dictionary mapping evaluator names to the proportion of pass results.
|
|
254
|
+
:rtype: Dict[str, float]
|
|
255
|
+
"""
|
|
256
|
+
results = {}
|
|
257
|
+
|
|
258
|
+
# Find all columns that end with "_result"
|
|
259
|
+
result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
|
|
260
|
+
|
|
261
|
+
for col in result_columns:
|
|
262
|
+
# Extract the evaluator name from the column name
|
|
263
|
+
# (outputs.<evaluator>.<metric>_result)
|
|
264
|
+
parts = col.split(".")
|
|
265
|
+
evaluator_name = None
|
|
266
|
+
if len(parts) >= 3:
|
|
267
|
+
evaluator_name = parts[1]
|
|
268
|
+
else:
|
|
269
|
+
LOGGER.warning(
|
|
270
|
+
"Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col
|
|
271
|
+
)
|
|
272
|
+
continue
|
|
273
|
+
if evaluator_name:
|
|
274
|
+
# Count the occurrences of each unique value (pass/fail)
|
|
275
|
+
value_counts = df[col].value_counts().to_dict()
|
|
276
|
+
|
|
277
|
+
# Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
|
|
278
|
+
total_rows = len(df)
|
|
279
|
+
pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
|
|
280
|
+
proportion = pass_count / total_rows if total_rows > 0 else 0.0
|
|
281
|
+
|
|
282
|
+
# Set the result with the evaluator name as the key
|
|
283
|
+
result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
|
|
284
|
+
results[result_key] = round(proportion, 2)
|
|
285
|
+
|
|
286
|
+
return results
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
|
|
290
|
+
"""Identify token count columns from known SDK metrics that should be excluded from aggregation.
|
|
291
|
+
|
|
292
|
+
Token counts from custom evaluators are not excluded, only those from EvaluationMetrics
|
|
293
|
+
and _InternalEvaluationMetrics.
|
|
294
|
+
|
|
295
|
+
:param df: The dataframe of evaluation results.
|
|
296
|
+
:type df: ~pandas.DataFrame
|
|
297
|
+
:return: List of column names to exclude from aggregation.
|
|
298
|
+
:rtype: List[str]
|
|
299
|
+
"""
|
|
300
|
+
# Get all metric values from EvaluationMetrics class
|
|
301
|
+
evaluation_metrics_values = [
|
|
302
|
+
getattr(EvaluationMetrics, attr)
|
|
303
|
+
for attr in dir(EvaluationMetrics)
|
|
304
|
+
if not attr.startswith("_") and isinstance(getattr(EvaluationMetrics, attr), str)
|
|
305
|
+
]
|
|
306
|
+
|
|
307
|
+
# Get all metric values from _InternalEvaluationMetrics class
|
|
308
|
+
internal_metrics_values = [
|
|
309
|
+
getattr(_InternalEvaluationMetrics, attr)
|
|
310
|
+
for attr in dir(_InternalEvaluationMetrics)
|
|
311
|
+
if not attr.startswith("_") and isinstance(getattr(_InternalEvaluationMetrics, attr), str)
|
|
312
|
+
]
|
|
313
|
+
|
|
314
|
+
# Combine all known metrics
|
|
315
|
+
all_known_metrics = evaluation_metrics_values + internal_metrics_values
|
|
316
|
+
|
|
317
|
+
# Find token count columns that belong to known metrics
|
|
318
|
+
token_count_cols = [
|
|
319
|
+
col
|
|
320
|
+
for col in df.columns
|
|
321
|
+
if (
|
|
322
|
+
any(
|
|
323
|
+
col.endswith(f"{metric}_prompt_tokens")
|
|
324
|
+
or col.endswith(f"{metric}_completion_tokens")
|
|
325
|
+
or col.endswith(f"{metric}_total_tokens")
|
|
326
|
+
for metric in all_known_metrics
|
|
327
|
+
)
|
|
328
|
+
)
|
|
329
|
+
]
|
|
330
|
+
|
|
331
|
+
return token_count_cols
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
|
|
117
335
|
"""Aggregate metrics from the evaluation results.
|
|
118
336
|
On top of naively calculating the mean of most metrics, this function also identifies certain columns
|
|
119
337
|
that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
|
|
@@ -122,10 +340,12 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
|
|
|
122
340
|
:param df: The dataframe of evaluation results.
|
|
123
341
|
:type df: ~pandas.DataFrame
|
|
124
342
|
:param evaluators: A dictionary mapping of strings to evaluator classes.
|
|
125
|
-
:type evaluators: Dict[str,
|
|
343
|
+
:type evaluators: Dict[str, Callable]
|
|
126
344
|
:return: The aggregated metrics.
|
|
127
345
|
:rtype: Dict[str, float]
|
|
128
346
|
"""
|
|
347
|
+
binary_metrics = _aggregation_binary_output(df)
|
|
348
|
+
|
|
129
349
|
df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
|
|
130
350
|
|
|
131
351
|
handled_columns = []
|
|
@@ -133,134 +353,265 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
|
|
|
133
353
|
# Rename certain columns as defect rates if we know that's what their aggregates represent
|
|
134
354
|
# Content safety metrics
|
|
135
355
|
content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
|
|
356
|
+
other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
|
|
136
357
|
handled_columns.extend(content_safety_cols)
|
|
358
|
+
handled_columns.extend(other_renamed_cols)
|
|
137
359
|
defect_rates.update(cs_defect_rates)
|
|
360
|
+
defect_rates.update(renamed_cols)
|
|
138
361
|
# Label-based (true/false) metrics where 'true' means 'something is wrong'
|
|
139
362
|
label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
|
|
140
363
|
handled_columns.extend(label_cols)
|
|
141
364
|
defect_rates.update(label_defect_rates)
|
|
142
365
|
|
|
366
|
+
# Exclude token count columns from aggregation for known SDK metrics
|
|
367
|
+
token_count_cols = _get_token_count_columns_to_exclude(df)
|
|
368
|
+
handled_columns.extend(token_count_cols)
|
|
369
|
+
|
|
143
370
|
# For rest of metrics, we will calculate mean
|
|
144
371
|
df.drop(columns=handled_columns, inplace=True)
|
|
145
372
|
|
|
373
|
+
# Convert "not applicable" strings to None to allow proper numeric aggregation
|
|
374
|
+
df = df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
|
|
375
|
+
|
|
376
|
+
# NOTE: nan/None values don't count as as booleans, so boolean columns with
|
|
377
|
+
# nan/None values won't have a mean produced from them.
|
|
378
|
+
# This is different from label-based known evaluators, which have special handling.
|
|
146
379
|
mean_value = df.mean(numeric_only=True)
|
|
147
380
|
metrics = mean_value.to_dict()
|
|
148
381
|
# Add defect rates back into metrics
|
|
149
382
|
metrics.update(defect_rates)
|
|
383
|
+
|
|
384
|
+
# Add binary threshold metrics based on pass/fail results
|
|
385
|
+
metrics.update(binary_metrics)
|
|
386
|
+
|
|
150
387
|
return metrics
|
|
151
388
|
|
|
152
389
|
|
|
153
|
-
def
|
|
390
|
+
def _validate_columns_for_target(
|
|
391
|
+
df: pd.DataFrame,
|
|
392
|
+
target: Callable,
|
|
393
|
+
) -> None:
|
|
394
|
+
"""
|
|
395
|
+
Check that all columns needed by target function are present.
|
|
396
|
+
|
|
397
|
+
:param df: The data frame to be validated.
|
|
398
|
+
:type df: pd.DataFrame
|
|
399
|
+
:param target: The callable to be applied to data set.
|
|
400
|
+
:type target: Optional[Callable]
|
|
401
|
+
:raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
|
|
402
|
+
"""
|
|
403
|
+
if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
|
|
404
|
+
msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
|
|
405
|
+
raise EvaluationException(
|
|
406
|
+
message=msg,
|
|
407
|
+
internal_message=msg,
|
|
408
|
+
target=ErrorTarget.EVALUATE,
|
|
409
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
410
|
+
blame=ErrorBlame.USER_ERROR,
|
|
411
|
+
)
|
|
412
|
+
# If the target function is given, it may return
|
|
413
|
+
# several columns and hence we cannot check the availability of columns
|
|
414
|
+
# without knowing target function semantics.
|
|
415
|
+
# Instead, here we will validate the columns, taken by target.
|
|
154
416
|
required_inputs = [
|
|
155
417
|
param.name
|
|
156
|
-
for param in inspect.signature(
|
|
418
|
+
for param in inspect.signature(target).parameters.values()
|
|
157
419
|
if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
|
|
158
420
|
]
|
|
159
421
|
|
|
160
|
-
missing_inputs = [col for col in required_inputs if col not in
|
|
422
|
+
missing_inputs = [col for col in required_inputs if col not in df.columns]
|
|
161
423
|
if missing_inputs:
|
|
162
|
-
|
|
163
|
-
msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
|
|
164
|
-
raise EvaluationException(
|
|
165
|
-
message=msg,
|
|
166
|
-
internal_message=msg,
|
|
167
|
-
target=ErrorTarget.EVALUATE,
|
|
168
|
-
category=ErrorCategory.MISSING_FIELD,
|
|
169
|
-
blame=ErrorBlame.USER_ERROR,
|
|
170
|
-
)
|
|
171
|
-
msg = f"Missing required inputs for target : {missing_inputs}."
|
|
424
|
+
msg = f"Missing required inputs for target: {missing_inputs}."
|
|
172
425
|
raise EvaluationException(
|
|
173
426
|
message=msg,
|
|
174
|
-
internal_message=msg,
|
|
175
427
|
target=ErrorTarget.EVALUATE,
|
|
176
428
|
category=ErrorCategory.MISSING_FIELD,
|
|
177
429
|
blame=ErrorBlame.USER_ERROR,
|
|
178
430
|
)
|
|
179
431
|
|
|
180
432
|
|
|
181
|
-
def
|
|
433
|
+
def _validate_columns_for_evaluators(
|
|
434
|
+
df: pd.DataFrame,
|
|
435
|
+
evaluators: Dict[str, Callable],
|
|
436
|
+
target: Optional[Callable],
|
|
437
|
+
target_generated_columns: Optional[Set[str]],
|
|
438
|
+
column_mapping: Dict[str, Dict[str, str]],
|
|
439
|
+
) -> None:
|
|
440
|
+
"""
|
|
441
|
+
Check that all columns needed by evaluators are present.
|
|
442
|
+
|
|
443
|
+
:param df: The data frame to be validated.
|
|
444
|
+
:type df: pd.DataFrame
|
|
445
|
+
:param evaluators: The dictionary of evaluators.
|
|
446
|
+
:type evaluators: Dict[str, Callable]
|
|
447
|
+
:param target: The callable to be applied to data set.
|
|
448
|
+
:type target: Optional[Callable]
|
|
449
|
+
:param target_generated_columns: The set of columns generated by the target callable.
|
|
450
|
+
:type target_generated_columns: Optional[Set[str]]
|
|
451
|
+
:param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
|
|
452
|
+
:type column_mapping: Dict[str, Dict[str, str]]
|
|
453
|
+
:raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
|
|
454
|
+
"""
|
|
455
|
+
missing_inputs_per_evaluator = {}
|
|
456
|
+
|
|
457
|
+
for evaluator_name, evaluator in evaluators.items():
|
|
458
|
+
# Apply column mapping
|
|
459
|
+
mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
|
|
460
|
+
new_df = _apply_column_mapping(df, mapping_config)
|
|
461
|
+
|
|
462
|
+
# Validate input data for evaluator
|
|
463
|
+
is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
|
|
464
|
+
if is_built_in:
|
|
465
|
+
# Note that for built-in evaluators supporting the "conversation" parameter,
|
|
466
|
+
# input parameters are now optional.
|
|
467
|
+
evaluator_params = [
|
|
468
|
+
param.name
|
|
469
|
+
for param in inspect.signature(evaluator).parameters.values()
|
|
470
|
+
if param.name not in ["kwargs", "args", "self"]
|
|
471
|
+
]
|
|
472
|
+
|
|
473
|
+
if "conversation" in evaluator_params and "conversation" in new_df.columns:
|
|
474
|
+
# Ignore the missing fields if "conversation" presents in the input data
|
|
475
|
+
missing_inputs = []
|
|
476
|
+
else:
|
|
477
|
+
optional_params = (
|
|
478
|
+
cast(Any, evaluator)._OPTIONAL_PARAMS # pylint: disable=protected-access
|
|
479
|
+
if hasattr(evaluator, "_OPTIONAL_PARAMS")
|
|
480
|
+
else []
|
|
481
|
+
)
|
|
482
|
+
excluded_params = set(new_df.columns).union(optional_params)
|
|
483
|
+
missing_inputs = [col for col in evaluator_params if col not in excluded_params]
|
|
484
|
+
|
|
485
|
+
# If "conversation" is the only parameter and it is missing, keep it in the missing inputs
|
|
486
|
+
# Otherwise, remove it from the missing inputs
|
|
487
|
+
if "conversation" in missing_inputs:
|
|
488
|
+
if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
|
|
489
|
+
missing_inputs.remove("conversation")
|
|
490
|
+
else:
|
|
491
|
+
evaluator_params = [
|
|
492
|
+
param.name
|
|
493
|
+
for param in inspect.signature(evaluator).parameters.values()
|
|
494
|
+
if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
|
|
495
|
+
]
|
|
496
|
+
|
|
497
|
+
missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
|
|
498
|
+
|
|
499
|
+
if missing_inputs:
|
|
500
|
+
missing_inputs_per_evaluator[evaluator_name] = missing_inputs
|
|
501
|
+
|
|
502
|
+
if missing_inputs_per_evaluator:
|
|
503
|
+
msg = "Some evaluators are missing required inputs:\n"
|
|
504
|
+
for evaluator_name, missing in missing_inputs_per_evaluator.items():
|
|
505
|
+
msg += f"- {evaluator_name}: {missing}\n"
|
|
506
|
+
|
|
507
|
+
# Add the additional notes
|
|
508
|
+
msg += "\nTo resolve this issue:\n"
|
|
509
|
+
msg += "- Ensure the data contains required inputs.\n"
|
|
510
|
+
if target is not None:
|
|
511
|
+
msg += "- Verify that the target is generating the necessary columns for the evaluators. "
|
|
512
|
+
msg += f"Currently generated columns: {target_generated_columns} \n"
|
|
513
|
+
msg += "- Check that the column mapping is correctly configured."
|
|
514
|
+
|
|
515
|
+
raise EvaluationException(
|
|
516
|
+
message=msg.strip(),
|
|
517
|
+
target=ErrorTarget.EVALUATE,
|
|
518
|
+
category=ErrorCategory.MISSING_FIELD,
|
|
519
|
+
blame=ErrorBlame.USER_ERROR,
|
|
520
|
+
)
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name, tags):
|
|
182
524
|
if data is None:
|
|
183
|
-
msg = "data parameter
|
|
525
|
+
msg = "The 'data' parameter is required for evaluation."
|
|
184
526
|
raise EvaluationException(
|
|
185
527
|
message=msg,
|
|
186
|
-
internal_message=msg,
|
|
187
528
|
target=ErrorTarget.EVALUATE,
|
|
188
|
-
category=ErrorCategory.
|
|
529
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
530
|
+
blame=ErrorBlame.USER_ERROR,
|
|
531
|
+
)
|
|
532
|
+
if not isinstance(data, (os.PathLike, str)):
|
|
533
|
+
msg = "The 'data' parameter must be a string or a path-like object."
|
|
534
|
+
raise EvaluationException(
|
|
535
|
+
message=msg,
|
|
536
|
+
target=ErrorTarget.EVALUATE,
|
|
537
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
538
|
+
blame=ErrorBlame.USER_ERROR,
|
|
539
|
+
)
|
|
540
|
+
if not os.path.exists(data):
|
|
541
|
+
msg = f"The input data file path '{data}' does not exist."
|
|
542
|
+
raise EvaluationException(
|
|
543
|
+
message=msg,
|
|
544
|
+
target=ErrorTarget.EVALUATE,
|
|
545
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
189
546
|
blame=ErrorBlame.USER_ERROR,
|
|
190
547
|
)
|
|
191
548
|
|
|
192
549
|
if target is not None:
|
|
193
550
|
if not callable(target):
|
|
194
|
-
msg = "target parameter must be a callable function."
|
|
551
|
+
msg = "The 'target' parameter must be a callable function."
|
|
195
552
|
raise EvaluationException(
|
|
196
553
|
message=msg,
|
|
197
|
-
internal_message=msg,
|
|
198
554
|
target=ErrorTarget.EVALUATE,
|
|
199
555
|
category=ErrorCategory.INVALID_VALUE,
|
|
200
556
|
blame=ErrorBlame.USER_ERROR,
|
|
201
557
|
)
|
|
202
558
|
|
|
203
|
-
if
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
559
|
+
if not evaluators:
|
|
560
|
+
msg = "The 'evaluators' parameter is required and cannot be None or empty."
|
|
561
|
+
raise EvaluationException(
|
|
562
|
+
message=msg,
|
|
563
|
+
target=ErrorTarget.EVALUATE,
|
|
564
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
565
|
+
blame=ErrorBlame.USER_ERROR,
|
|
566
|
+
)
|
|
567
|
+
if not isinstance(evaluators, dict):
|
|
568
|
+
msg = "The 'evaluators' parameter must be a dictionary."
|
|
569
|
+
raise EvaluationException(
|
|
570
|
+
message=msg,
|
|
571
|
+
target=ErrorTarget.EVALUATE,
|
|
572
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
573
|
+
blame=ErrorBlame.USER_ERROR,
|
|
574
|
+
)
|
|
213
575
|
|
|
214
|
-
if
|
|
215
|
-
if not isinstance(
|
|
216
|
-
msg = "
|
|
576
|
+
if output_path is not None:
|
|
577
|
+
if not isinstance(output_path, (os.PathLike, str)):
|
|
578
|
+
msg = "The 'output_path' parameter must be a string or a path-like object."
|
|
217
579
|
raise EvaluationException(
|
|
218
580
|
message=msg,
|
|
219
|
-
internal_message=msg,
|
|
220
581
|
target=ErrorTarget.EVALUATE,
|
|
221
582
|
category=ErrorCategory.INVALID_VALUE,
|
|
222
583
|
blame=ErrorBlame.USER_ERROR,
|
|
223
584
|
)
|
|
224
585
|
|
|
225
|
-
|
|
226
|
-
if not
|
|
227
|
-
msg = "
|
|
586
|
+
output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
|
|
587
|
+
if output_dir and not os.path.exists(output_dir):
|
|
588
|
+
msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
|
|
228
589
|
raise EvaluationException(
|
|
229
590
|
message=msg,
|
|
230
|
-
internal_message=msg,
|
|
231
591
|
target=ErrorTarget.EVALUATE,
|
|
232
592
|
category=ErrorCategory.INVALID_VALUE,
|
|
233
593
|
blame=ErrorBlame.USER_ERROR,
|
|
234
594
|
)
|
|
235
595
|
|
|
236
596
|
if azure_ai_project is not None:
|
|
237
|
-
|
|
238
|
-
msg = "azure_ai_project parameter must be a dictionary."
|
|
239
|
-
raise EvaluationException(
|
|
240
|
-
message=msg,
|
|
241
|
-
internal_message=msg,
|
|
242
|
-
target=ErrorTarget.EVALUATE,
|
|
243
|
-
category=ErrorCategory.INVALID_VALUE,
|
|
244
|
-
blame=ErrorBlame.USER_ERROR,
|
|
245
|
-
)
|
|
597
|
+
validate_azure_ai_project(azure_ai_project)
|
|
246
598
|
|
|
247
599
|
if evaluation_name is not None:
|
|
248
|
-
if not isinstance(evaluation_name, str):
|
|
249
|
-
msg = "evaluation_name parameter must be a string."
|
|
600
|
+
if not isinstance(evaluation_name, str) or not evaluation_name.strip():
|
|
601
|
+
msg = "The 'evaluation_name' parameter must be a non-empty string."
|
|
250
602
|
raise EvaluationException(
|
|
251
603
|
message=msg,
|
|
252
|
-
internal_message=msg,
|
|
253
604
|
target=ErrorTarget.EVALUATE,
|
|
254
605
|
category=ErrorCategory.INVALID_VALUE,
|
|
255
606
|
blame=ErrorBlame.USER_ERROR,
|
|
256
607
|
)
|
|
257
608
|
|
|
258
609
|
try:
|
|
259
|
-
|
|
610
|
+
data_loader = DataLoaderFactory.get_loader(data)
|
|
611
|
+
initial_data_df = data_loader.load()
|
|
260
612
|
except Exception as e:
|
|
261
613
|
raise EvaluationException(
|
|
262
|
-
message=f"
|
|
263
|
-
internal_message="Failed to load data. Confirm that it is valid jsonl data.",
|
|
614
|
+
message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
|
|
264
615
|
target=ErrorTarget.EVALUATE,
|
|
265
616
|
category=ErrorCategory.INVALID_VALUE,
|
|
266
617
|
blame=ErrorBlame.USER_ERROR,
|
|
@@ -269,88 +620,67 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
|
|
|
269
620
|
return initial_data_df
|
|
270
621
|
|
|
271
622
|
|
|
272
|
-
def _validate_columns(
|
|
273
|
-
df: pd.DataFrame,
|
|
274
|
-
evaluators: Dict[str, Any],
|
|
275
|
-
target: Optional[Callable],
|
|
276
|
-
evaluator_config: Dict[str, Dict[str, str]],
|
|
277
|
-
) -> None:
|
|
278
|
-
"""
|
|
279
|
-
Check that all columns needed by evaluator or target function are present.
|
|
280
|
-
|
|
281
|
-
:param df: The data frame to be validated.
|
|
282
|
-
:type df: pd.DataFrame
|
|
283
|
-
:param evaluators: The dictionary of evaluators.
|
|
284
|
-
:type evaluators: Dict[str, Any]
|
|
285
|
-
:param target: The callable to be applied to data set.
|
|
286
|
-
:type target: Optional[Callable]
|
|
287
|
-
:param evaluator_config: The configuration for evaluators.
|
|
288
|
-
:type evaluator_config: Dict[str, Dict[str, str]]
|
|
289
|
-
:raises EvaluationException: If column starts from "__outputs." while target is defined.
|
|
290
|
-
"""
|
|
291
|
-
if target:
|
|
292
|
-
if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
|
|
293
|
-
msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
|
|
294
|
-
raise EvaluationException(
|
|
295
|
-
message=msg,
|
|
296
|
-
internal_message=msg,
|
|
297
|
-
target=ErrorTarget.EVALUATE,
|
|
298
|
-
category=ErrorCategory.INVALID_VALUE,
|
|
299
|
-
blame=ErrorBlame.USER_ERROR,
|
|
300
|
-
)
|
|
301
|
-
# If the target function is given, it may return
|
|
302
|
-
# several columns and hence we cannot check the availability of columns
|
|
303
|
-
# without knowing target function semantics.
|
|
304
|
-
# Instead, here we will validate the columns, taken by target.
|
|
305
|
-
_validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
|
|
306
|
-
else:
|
|
307
|
-
for evaluator_name, evaluator in evaluators.items():
|
|
308
|
-
# Apply column mapping
|
|
309
|
-
mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
|
|
310
|
-
new_df = _apply_column_mapping(df, mapping_config)
|
|
311
|
-
|
|
312
|
-
# Validate input data for evaluator
|
|
313
|
-
_validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
|
|
314
|
-
|
|
315
|
-
|
|
316
623
|
def _apply_target_to_data(
|
|
317
624
|
target: Callable,
|
|
318
|
-
data: str,
|
|
319
|
-
|
|
625
|
+
data: Union[str, os.PathLike, pd.DataFrame],
|
|
626
|
+
batch_client: BatchClient,
|
|
320
627
|
initial_data: pd.DataFrame,
|
|
321
628
|
evaluation_name: Optional[str] = None,
|
|
322
|
-
|
|
323
|
-
) -> Tuple[pd.DataFrame, Set[str]]:
|
|
629
|
+
**kwargs,
|
|
630
|
+
) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
|
|
324
631
|
"""
|
|
325
632
|
Apply the target function to the data set and return updated data and generated columns.
|
|
326
633
|
|
|
327
634
|
:param target: The function to be applied to data.
|
|
328
635
|
:type target: Callable
|
|
329
|
-
:param data: The path to input jsonl file.
|
|
330
|
-
:type data: str
|
|
331
|
-
:param
|
|
332
|
-
:type
|
|
636
|
+
:param data: The path to input jsonl or csv file.
|
|
637
|
+
:type data: Union[str, os.PathLike]
|
|
638
|
+
:param batch_client: The promptflow client to be used.
|
|
639
|
+
:type batch_client: PFClient
|
|
333
640
|
:param initial_data: The data frame with the loaded data.
|
|
334
641
|
:type initial_data: pd.DataFrame
|
|
335
642
|
:param evaluation_name: The name of the evaluation.
|
|
336
643
|
:type evaluation_name: Optional[str]
|
|
337
|
-
:param _run_name: The name of target run. Used for testing only.
|
|
338
|
-
:type _run_name: Optional[str]
|
|
339
644
|
:return: The tuple, containing data frame and the list of added columns.
|
|
340
645
|
:rtype: Tuple[pandas.DataFrame, List[str]]
|
|
341
646
|
"""
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
647
|
+
|
|
648
|
+
_run_name = kwargs.get("_run_name")
|
|
649
|
+
with TargetRunContext(batch_client):
|
|
650
|
+
run: BatchClientRun = batch_client.run(
|
|
651
|
+
flow=target,
|
|
652
|
+
display_name=evaluation_name,
|
|
653
|
+
data=data,
|
|
654
|
+
stream=True,
|
|
655
|
+
name=_run_name,
|
|
656
|
+
evaluator_name=getattr(target, "__qualname__", "TARGET"),
|
|
657
|
+
)
|
|
658
|
+
target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
|
|
659
|
+
run_summary = batch_client.get_run_summary(run)
|
|
660
|
+
|
|
661
|
+
if run_summary["completed_lines"] == 0:
|
|
662
|
+
msg = (
|
|
663
|
+
f"Evaluation target failed to produce any results."
|
|
664
|
+
f" Please check the logs at {run_summary['log_path']} for more details about cause of failure."
|
|
665
|
+
)
|
|
666
|
+
raise EvaluationException(
|
|
667
|
+
message=msg,
|
|
668
|
+
target=ErrorTarget.EVALUATE,
|
|
669
|
+
category=ErrorCategory.FAILED_EXECUTION,
|
|
670
|
+
blame=ErrorBlame.USER_ERROR,
|
|
671
|
+
)
|
|
672
|
+
|
|
673
|
+
# Log a warning if some rows failed
|
|
674
|
+
failed_lines = run_summary.get("failed_lines", 0)
|
|
675
|
+
completed_lines = run_summary["completed_lines"]
|
|
676
|
+
total_lines = failed_lines + completed_lines
|
|
677
|
+
|
|
678
|
+
if failed_lines > 0:
|
|
679
|
+
LOGGER.warning(
|
|
680
|
+
f"Target function completed {completed_lines} out of {total_lines} rows. "
|
|
681
|
+
f"{failed_lines} rows failed and will be filled with NaN values."
|
|
682
|
+
)
|
|
683
|
+
|
|
354
684
|
# Remove input and output prefix
|
|
355
685
|
generated_columns = {
|
|
356
686
|
col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
|
|
@@ -358,6 +688,13 @@ def _apply_target_to_data(
|
|
|
358
688
|
# Sort output by line numbers
|
|
359
689
|
target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
|
|
360
690
|
target_output.sort_index(inplace=True)
|
|
691
|
+
|
|
692
|
+
initial_data_with_line_numbers = initial_data.copy()
|
|
693
|
+
initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
|
|
694
|
+
|
|
695
|
+
complete_index = initial_data_with_line_numbers[LINE_NUMBER]
|
|
696
|
+
target_output = target_output.reindex(complete_index)
|
|
697
|
+
|
|
361
698
|
target_output.reset_index(inplace=True, drop=False)
|
|
362
699
|
# target_output contains only input columns, taken by function,
|
|
363
700
|
# so we need to concatenate it to the input data frame.
|
|
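
Editor's note (illustrative, not part of the package): a minimal pandas sketch of the reindexing step shown in this hunk — reindexing the target output against the full input row range leaves NaN rows for lines the target failed on. Column names and the line-number index are placeholders.

```python
import pandas as pd

# Three input rows; suppose the target only completed rows 0 and 2.
initial_data = pd.DataFrame({"query": ["q0", "q1", "q2"]})
target_output = pd.DataFrame(
    {"inputs.line_number": [0, 2], "outputs.response": ["r0", "r2"]}
).set_index("inputs.line_number")

# Reindex against the complete 0..N-1 range, as the diff above does with
# initial_data_with_line_numbers[LINE_NUMBER]; the failed row becomes NaN.
complete_index = pd.RangeIndex(len(initial_data))
target_output = target_output.reindex(complete_index)
print(target_output)  # row 1 is all-NaN
```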
@@ -366,34 +703,36 @@ def _apply_target_to_data(
|
|
|
366
703
|
# Rename outputs columns to __outputs
|
|
367
704
|
rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
|
|
368
705
|
target_output.rename(columns=rename_dict, inplace=True)
|
|
369
|
-
# Concatenate output to input
|
|
370
|
-
target_output = pd.concat([
|
|
706
|
+
# Concatenate output to input - now both dataframes have the same number of rows
|
|
707
|
+
target_output = pd.concat([initial_data, target_output], axis=1)
|
|
371
708
|
|
|
372
709
|
return target_output, generated_columns, run
|
|
373
710
|
|
|
374
711
|
|
|
-def
-
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param
-    :type
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
 
-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}
 
-
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")
 
-    if
-    for evaluator, mapping_config in
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
-                    if
-                    msg = "Unexpected references detected in '
+                    if not expected_references.search(map_value):
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
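
Editor's note (illustrative sketch, not from the package): how the expected_references pattern above behaves for a few mapping values; the sample strings are made up.

```python
import re

expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

for value in ["${data.query}", "${target.response}", "${data.item.context}", "${run.outputs.score}"]:
    print(value, "->", bool(expected_references.search(value)))
# The first three match; "${run.outputs.score}" would trigger the EvaluationException above.
```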
@@ -432,94 +771,93 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
432
771
|
return df
|
|
433
772
|
|
|
434
773
|
|
|
-# @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
-    evaluators: Dict[str, Callable],
+    data: Union[str, os.PathLike],
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str,
-    azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
447
787
|
"""Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
|
|
448
788
|
data will be run through target function and then results will be evaluated.
|
|
449
789
|
|
|
450
790
|
:keyword data: Path to the data to be evaluated or passed to target if target is set.
|
|
451
|
-
|
|
791
|
+
JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
|
|
452
792
|
:paramtype data: str
|
|
453
793
|
:keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
|
|
454
|
-
and value as the evaluator function.
|
|
455
|
-
|
|
794
|
+
and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
|
|
795
|
+
Required.
|
|
796
|
+
:paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
|
|
456
797
|
:keyword evaluation_name: Display name of the evaluation.
|
|
457
798
|
:paramtype evaluation_name: Optional[str]
|
|
458
799
|
:keyword target: Target to be evaluated. `target` and `data` both cannot be None
|
|
459
800
|
:paramtype target: Optional[Callable]
|
|
460
801
|
:keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
|
|
461
|
-
names as keys and a
|
|
462
|
-
keys as the column names in the evaluator input and values as the column names in the
|
|
463
|
-
generated by target.
|
|
464
|
-
:paramtype evaluator_config: Optional[Dict[str,
|
|
802
|
+
names as keys and values that are dictionaries containing the column mappings. The column mappings should
|
|
803
|
+
be a dictionary with keys as the column names in the evaluator input and values as the column names in the
|
|
804
|
+
input data or data generated by target.
|
|
805
|
+
:paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
|
|
465
806
|
:keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
|
|
466
807
|
the results will be saved to a file named `evaluation_results.json` in the folder.
|
|
467
808
|
:paramtype output_path: Optional[str]
|
|
468
|
-
:keyword azure_ai_project:
|
|
469
|
-
|
|
809
|
+
:keyword azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
|
|
810
|
+
or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
|
|
811
|
+
:paramtype azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
|
|
812
|
+
:keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
|
|
813
|
+
if ANY evaluator fails during their evaluation.
|
|
814
|
+
Defaults to false, which means that evaluations will continue regardless of failures.
|
|
815
|
+
If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
|
|
816
|
+
:paramtype fail_on_evaluator_errors: bool
|
|
817
|
+
:keyword tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
|
|
818
|
+
Keys and values must be strings. For more information about tag limits, see:
|
|
819
|
+
https://learn.microsoft.com/en-us/azure/machine-learning/resource-limits-capacity?view=azureml-api-2#runs
|
|
820
|
+
:paramtype tags: Optional[Dict[str, str]]
|
|
821
|
+
:keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
|
|
822
|
+
:paramtype user_agent: Optional[str]
|
|
470
823
|
:return: Evaluation results.
|
|
471
|
-
:rtype:
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
path = "evaluate_test_data.jsonl"
|
|
492
|
-
result = evaluate(
|
|
493
|
-
data=path,
|
|
494
|
-
evaluators={
|
|
495
|
-
"coherence": coherence_eval,
|
|
496
|
-
"relevance": relevance_eval,
|
|
497
|
-
},
|
|
498
|
-
evaluator_config={
|
|
499
|
-
"coherence": {
|
|
500
|
-
"response": "${data.response}",
|
|
501
|
-
"query": "${data.query}"
|
|
502
|
-
},
|
|
503
|
-
"relevance": {
|
|
504
|
-
"response": "${data.response}",
|
|
505
|
-
"context": "${data.context}",
|
|
506
|
-
"query": "${data.query}"
|
|
507
|
-
}
|
|
508
|
-
}
|
|
509
|
-
)
|
|
510
|
-
|
|
824
|
+
:rtype: ~azure.ai.evaluation.EvaluationResult
|
|
825
|
+
|
|
826
|
+
.. admonition:: Example:
|
|
827
|
+
|
|
828
|
+
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
|
|
829
|
+
:start-after: [START evaluate_method]
|
|
830
|
+
:end-before: [END evaluate_method]
|
|
831
|
+
:language: python
|
|
832
|
+
:dedent: 8
|
|
833
|
+
:caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
|
|
834
|
+
|
|
835
|
+
.. admonition:: Example using Azure AI Project URL:
|
|
836
|
+
|
|
837
|
+
.. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
|
|
838
|
+
:start-after: [START evaluate_method]
|
|
839
|
+
:end-before: [END evaluate_method]
|
|
840
|
+
:language: python
|
|
841
|
+
:dedent: 8
|
|
842
|
+
:caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
|
|
843
|
+
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
|
|
511
844
|
"""
|
|
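
Editor's note: a hedged usage sketch of the evaluate() signature documented above. The model configuration keys, file names, and column mappings are placeholders, not values taken from this diff.

```python
from azure.ai.evaluation import evaluate, CoherenceEvaluator, RelevanceEvaluator

model_config = {  # placeholder AzureOpenAI model configuration (assumed keys)
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}
result = evaluate(
    data="evaluate_test_data.jsonl",  # JSONL or CSV
    evaluators={
        "coherence": CoherenceEvaluator(model_config),
        "relevance": RelevanceEvaluator(model_config),
    },
    evaluator_config={
        "coherence": {"column_mapping": {"query": "${data.query}", "response": "${data.response}"}},
    },
    output_path="./evaluation_results.json",
)
print(result["metrics"])
```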
512
845
|
try:
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
846
|
+
user_agent: Optional[str] = kwargs.get("user_agent")
|
|
847
|
+
with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
|
|
848
|
+
results = _evaluate(
|
|
849
|
+
evaluation_name=evaluation_name,
|
|
850
|
+
target=target,
|
|
851
|
+
data=data,
|
|
852
|
+
evaluators_and_graders=evaluators,
|
|
853
|
+
evaluator_config=evaluator_config,
|
|
854
|
+
azure_ai_project=azure_ai_project,
|
|
855
|
+
output_path=output_path,
|
|
856
|
+
fail_on_evaluator_errors=fail_on_evaluator_errors,
|
|
857
|
+
tags=tags,
|
|
858
|
+
**kwargs,
|
|
859
|
+
)
|
|
860
|
+
return results
|
|
523
861
|
except Exception as e:
|
|
524
862
|
# Handle multiprocess bootstrap error
|
|
525
863
|
bootstrap_error = (
|
|
@@ -538,116 +876,802 @@ def evaluate(
|
|
|
538
876
|
internal_message=error_message,
|
|
539
877
|
target=ErrorTarget.EVALUATE,
|
|
540
878
|
category=ErrorCategory.FAILED_EXECUTION,
|
|
541
|
-
blame=ErrorBlame.
|
|
879
|
+
blame=ErrorBlame.USER_ERROR,
|
|
880
|
+
) from e
|
|
881
|
+
|
|
882
|
+
# Ensure a consistent user experience when encountering errors by converting
|
|
883
|
+
# all other exceptions to EvaluationException.
|
|
884
|
+
if not isinstance(e, EvaluationException):
|
|
885
|
+
raise EvaluationException(
|
|
886
|
+
message=str(e),
|
|
887
|
+
target=ErrorTarget.EVALUATE,
|
|
888
|
+
category=ErrorCategory.FAILED_EXECUTION,
|
|
889
|
+
blame=ErrorBlame.SYSTEM_ERROR,
|
|
542
890
|
) from e
|
|
543
891
|
|
|
544
892
|
raise e
|
|
545
893
|
|
|
546
894
|
|
|
547
|
-
def
|
|
895
|
+
def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
|
|
896
|
+
# Extract evaluators with a non-empty "run_summary"
|
|
897
|
+
output_dict = {
|
|
898
|
+
name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
|
|
899
|
+
}
|
|
900
|
+
|
|
901
|
+
if output_dict:
|
|
902
|
+
print("======= Combined Run Summary (Per Evaluator) =======\n")
|
|
903
|
+
print(json.dumps(output_dict, indent=4))
|
|
904
|
+
print("\n====================================================\n")
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
def _print_fail_flag_warning() -> None:
|
|
908
|
+
print(
|
|
909
|
+
"Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
|
|
910
|
+
+ "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
|
|
911
|
+
+ "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
|
|
912
|
+
+ "without producing any outputs, since a single failure will cancel the entire run "
|
|
913
|
+
"when fail_on_evaluator_errors is enabled."
|
|
914
|
+
)
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
548
918
|
*,
|
|
919
|
+
evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
|
|
549
920
|
evaluation_name: Optional[str] = None,
|
|
550
921
|
target: Optional[Callable] = None,
|
|
551
|
-
data:
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
922
|
+
data: Union[str, os.PathLike],
|
|
923
|
+
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
|
|
924
|
+
azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
|
|
925
|
+
output_path: Optional[Union[str, os.PathLike]] = None,
|
|
926
|
+
fail_on_evaluator_errors: bool = False,
|
|
927
|
+
tags: Optional[Dict[str, str]] = None,
|
|
556
928
|
**kwargs,
|
|
557
|
-
):
|
|
558
|
-
|
|
929
|
+
) -> EvaluationResult:
|
|
930
|
+
if fail_on_evaluator_errors:
|
|
931
|
+
_print_fail_flag_warning()
|
|
559
932
|
|
|
933
|
+
# Turn inputted mess of data into a dataframe, apply targets if needed
|
|
934
|
+
# split graders and evaluators, and verify that column mappings are sensible.
|
|
935
|
+
validated_data = _preprocess_data(
|
|
936
|
+
data=data,
|
|
937
|
+
evaluators_and_graders=evaluators_and_graders,
|
|
938
|
+
evaluator_config=evaluator_config,
|
|
939
|
+
target=target,
|
|
940
|
+
output_path=output_path,
|
|
941
|
+
azure_ai_project=azure_ai_project,
|
|
942
|
+
evaluation_name=evaluation_name,
|
|
943
|
+
fail_on_evaluator_errors=fail_on_evaluator_errors,
|
|
944
|
+
tags=tags,
|
|
945
|
+
**kwargs,
|
|
946
|
+
)
|
|
947
|
+
|
|
948
|
+
# extract relevant info from validated data
|
|
949
|
+
column_mapping = validated_data["column_mapping"]
|
|
950
|
+
evaluators = validated_data["evaluators"]
|
|
951
|
+
graders = validated_data["graders"]
|
|
952
|
+
input_data_df = validated_data["input_data_df"]
|
|
953
|
+
results_df = pd.DataFrame()
|
|
954
|
+
metrics: Dict[str, float] = {}
|
|
955
|
+
eval_run_info_list: List[OAIEvalRunCreationInfo] = []
|
|
956
|
+
eval_run_summary_dict = {}
|
|
957
|
+
|
|
958
|
+
# Start OAI eval runs if any graders are present.
|
|
959
|
+
need_oai_run = len(graders) > 0
|
|
960
|
+
need_local_run = len(evaluators) > 0
|
|
961
|
+
need_get_oai_results = False
|
|
962
|
+
got_local_results = False
|
|
963
|
+
if need_oai_run:
|
|
964
|
+
try:
|
|
965
|
+
aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
|
|
966
|
+
eval_run_info_list = _begin_aoai_evaluation(graders, column_mapping, input_data_df, aoi_name, **kwargs)
|
|
967
|
+
need_get_oai_results = len(eval_run_info_list) > 0
|
|
968
|
+
except EvaluationException as e:
|
|
969
|
+
if need_local_run:
|
|
970
|
+
# If there are normal evaluators, don't stop execution and try to run
|
|
971
|
+
# those.
|
|
972
|
+
LOGGER.warning(
|
|
973
|
+
"Remote Azure Open AI grader evaluations failed during run creation."
|
|
974
|
+
+ " Continuing with local evaluators."
|
|
975
|
+
)
|
|
976
|
+
LOGGER.warning(e)
|
|
977
|
+
else:
|
|
978
|
+
raise e
|
|
979
|
+
|
|
980
|
+
# Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
|
|
981
|
+
if need_local_run:
|
|
982
|
+
try:
|
|
983
|
+
eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
|
|
984
|
+
validated_data=validated_data, fail_on_evaluator_errors=fail_on_evaluator_errors
|
|
985
|
+
)
|
|
986
|
+
results_df = eval_result_df
|
|
987
|
+
metrics = eval_metrics
|
|
988
|
+
got_local_results = True
|
|
989
|
+
# TODO figure out how to update this printing to include OAI results?
|
|
990
|
+
_print_summary(per_evaluator_results)
|
|
991
|
+
eval_run_summary_dict = {name: result["run_summary"] for name, result in per_evaluator_results.items()}
|
|
992
|
+
LOGGER.info(f"run_summary: \r\n{json.dumps(eval_run_summary_dict, indent=4)}")
|
|
993
|
+
except EvaluationException as e:
|
|
994
|
+
if need_get_oai_results:
|
|
995
|
+
# If there are OAI graders, we only print a warning on local failures.
|
|
996
|
+
LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
|
|
997
|
+
LOGGER.warning(e)
|
|
998
|
+
else:
|
|
999
|
+
raise e
|
|
1000
|
+
|
|
1001
|
+
# Retrieve OAI eval run results if needed.
|
|
1002
|
+
if need_get_oai_results:
|
|
1003
|
+
try:
|
|
1004
|
+
aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list) # type: ignore
|
|
1005
|
+
# Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
|
|
1006
|
+
|
|
1007
|
+
# Combine results if both evaluators and graders are present
|
|
1008
|
+
if len(evaluators) > 0:
|
|
1009
|
+
results_df = pd.concat([results_df, aoai_results], axis=1)
|
|
1010
|
+
metrics.update(aoai_metrics)
|
|
1011
|
+
else:
|
|
1012
|
+
# Otherwise combine aoai results with input data df to include input columns in outputs.
|
|
1013
|
+
results_df = pd.concat([input_data_df, aoai_results], axis=1)
|
|
1014
|
+
metrics = aoai_metrics
|
|
1015
|
+
except EvaluationException as e:
|
|
1016
|
+
if got_local_results:
|
|
1017
|
+
# If there are local eval results, we only print a warning on OAI failure.
|
|
1018
|
+
LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
|
|
1019
|
+
LOGGER.warning(e)
|
|
1020
|
+
else:
|
|
1021
|
+
raise e
|
|
1022
|
+
|
|
1023
|
+
# Done with all evaluations; massage outputs into their final forms and log results if needed.
|
|
1024
|
+
name_map = _map_names_to_builtins(evaluators, graders)
|
|
1025
|
+
if is_onedp_project(azure_ai_project):
|
|
1026
|
+
studio_url = _log_metrics_and_instance_results_onedp(
|
|
1027
|
+
metrics, results_df, azure_ai_project, evaluation_name, name_map, tags=tags, **kwargs
|
|
1028
|
+
)
|
|
1029
|
+
else:
|
|
1030
|
+
# Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
|
|
1031
|
+
trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
|
|
1032
|
+
studio_url = None
|
|
1033
|
+
if trace_destination:
|
|
1034
|
+
studio_url = _log_metrics_and_instance_results(
|
|
1035
|
+
metrics, results_df, trace_destination, None, evaluation_name, name_map, tags=tags, **kwargs
|
|
1036
|
+
)
|
|
1037
|
+
|
|
1038
|
+
result_df_dict = results_df.to_dict("records")
|
|
1039
|
+
result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
|
|
1040
|
+
# _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
|
|
1041
|
+
|
|
1042
|
+
eval_id: Optional[str] = kwargs.get("_eval_id")
|
|
1043
|
+
eval_run_id: Optional[str] = kwargs.get("_eval_run_id")
|
|
1044
|
+
eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("_eval_meta_data")
|
|
1045
|
+
if kwargs.get("_convert_to_aoai_evaluation_result", False):
|
|
1046
|
+
_convert_results_to_aoai_evaluation_results(
|
|
1047
|
+
result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data
|
|
1048
|
+
)
|
|
1049
|
+
if app_insights_configuration := kwargs.get("_app_insights_configuration"):
|
|
1050
|
+
emit_eval_result_events_to_app_insights(
|
|
1051
|
+
app_insights_configuration, result["_evaluation_results_list"], evaluator_config
|
|
1052
|
+
)
|
|
1053
|
+
|
|
1054
|
+
if output_path:
|
|
1055
|
+
_write_output(output_path, result)
|
|
1056
|
+
return result
|
|
1057
|
+
|
|
1058
|
+
|
|
1059
|
+
def _build_internal_log_attributes(
|
|
1060
|
+
event_data: Dict[str, Any],
|
|
1061
|
+
metric_name: str,
|
|
1062
|
+
evaluator_config: Optional[Dict[str, EvaluatorConfig]],
|
|
1063
|
+
internal_log_attributes: Dict[str, str],
|
|
1064
|
+
) -> Dict[str, str]:
|
|
1065
|
+
"""
|
|
1066
|
+
Build internal log attributes for OpenTelemetry logging.
|
|
1067
|
+
|
|
1068
|
+
:param event_data: The event data containing threshold and name information
|
|
1069
|
+
:type event_data: Dict[str, Any]
|
|
1070
|
+
:param metric_name: The name of the metric being evaluated
|
|
1071
|
+
:type metric_name: str
|
|
1072
|
+
:param evaluator_config: Configuration for evaluators
|
|
1073
|
+
:type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
|
|
1074
|
+
:return: Dictionary of internal log attributes
|
|
1075
|
+
:rtype: Dict[str, str]
|
|
1076
|
+
"""
|
|
1077
|
+
# Add threshold if present
|
|
1078
|
+
if event_data.get("threshold"):
|
|
1079
|
+
internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
|
|
1080
|
+
|
|
1081
|
+
# Add testing criteria details if present
|
|
1082
|
+
testing_criteria_name = event_data.get("name")
|
|
1083
|
+
if testing_criteria_name:
|
|
1084
|
+
internal_log_attributes["gen_ai.evaluation.testing_criteria.name"] = testing_criteria_name
|
|
1085
|
+
|
|
1086
|
+
# Get evaluator definition details
|
|
1087
|
+
if evaluator_config and testing_criteria_name in evaluator_config:
|
|
1088
|
+
testing_criteria_config = evaluator_config[testing_criteria_name]
|
|
1089
|
+
|
|
1090
|
+
if evaluator_name := testing_criteria_config.get("_evaluator_name"):
|
|
1091
|
+
internal_log_attributes["gen_ai.evaluator.name"] = str(evaluator_name)
|
|
1092
|
+
|
|
1093
|
+
if evaluator_version := testing_criteria_config.get("_evaluator_version"):
|
|
1094
|
+
internal_log_attributes["gen_ai.evaluator.version"] = str(evaluator_version)
|
|
1095
|
+
|
|
1096
|
+
if evaluator_id := testing_criteria_config.get("_evaluator_id"):
|
|
1097
|
+
internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
|
|
1098
|
+
|
|
1099
|
+
if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
|
|
1100
|
+
metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
|
|
1101
|
+
|
|
1102
|
+
if metric_config_detail:
|
|
1103
|
+
if metric_config_detail.get("min_value") is not None:
|
|
1104
|
+
internal_log_attributes["gen_ai.evaluation.min_value"] = str(metric_config_detail["min_value"])
|
|
1105
|
+
if metric_config_detail.get("max_value") is not None:
|
|
1106
|
+
internal_log_attributes["gen_ai.evaluation.max_value"] = str(metric_config_detail["max_value"])
|
|
1107
|
+
|
|
1108
|
+
return internal_log_attributes
|
|
1109
|
+
|
|
1110
|
+
|
|
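
Editor's note (illustrative only): the kind of dictionary _build_internal_log_attributes returns for a score-based metric; the keys follow the code above, the values are placeholders.

```python
internal_log_attributes = {
    "gen_ai.evaluation.threshold": "3",
    "gen_ai.evaluation.testing_criteria.name": "coherence",
    "gen_ai.evaluator.name": "builtin.coherence",  # taken from "_evaluator_name" in the config (placeholder)
    "gen_ai.evaluation.min_value": "1",
    "gen_ai.evaluation.max_value": "5",
}
```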
1111
|
+
def _log_events_to_app_insights(
|
|
1112
|
+
event_logger,
|
|
1113
|
+
events: List[Dict[str, Any]],
|
|
1114
|
+
log_attributes: Dict[str, Any],
|
|
1115
|
+
app_insights_config: AppInsightsConfig,
|
|
1116
|
+
data_source_item: Optional[Dict[str, Any]] = None,
|
|
1117
|
+
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
|
|
1118
|
+
) -> None:
|
|
1119
|
+
"""
|
|
1120
|
+
Log independent events directly to App Insights using OpenTelemetry event logging.
|
|
1121
|
+
|
|
1122
|
+
:param event_logger: OpenTelemetry event logger instance
|
|
1123
|
+
:type event_logger: EventLogger
|
|
1124
|
+
:param events: List of event data dictionaries to log
|
|
1125
|
+
:type events: List[Dict[str, Any]]
|
|
1126
|
+
:param log_attributes: Attributes dict to use for each event (already includes extra_attributes if present)
|
|
1127
|
+
:type log_attributes: Dict[str, Any]
|
|
1128
|
+
:param app_insights_config: App Insights configuration containing connection string
|
|
1129
|
+
:type app_insights_config: AppInsightsConfig
|
|
1130
|
+
:param data_source_item: Data source item containing trace, response, and agent information
|
|
1131
|
+
:type data_source_item: Optional[Dict[str, Any]]
|
|
1132
|
+
"""
|
|
1133
|
+
|
|
1134
|
+
from opentelemetry._events import Event
|
|
1135
|
+
|
|
1136
|
+
try:
|
|
1137
|
+
# Initialize values from AppInsights config as defaults
|
|
1138
|
+
trace_id = None
|
|
1139
|
+
span_id = None
|
|
1140
|
+
response_id = None
|
|
1141
|
+
conversation_id = None
|
|
1142
|
+
previous_response_id = None
|
|
1143
|
+
agent_id = app_insights_config.get("agent_id", None)
|
|
1144
|
+
agent_version = app_insights_config.get("agent_version", None)
|
|
1145
|
+
agent_name = app_insights_config.get("agent_name", None)
|
|
1146
|
+
|
|
1147
|
+
# Data source item values have higher priority and will override AppInsights config defaults
|
|
1148
|
+
if data_source_item:
|
|
1149
|
+
for key, value in data_source_item.items():
|
|
1150
|
+
if key.endswith("trace_id") and value and isinstance(value, str):
|
|
1151
|
+
# Remove dashes if present
|
|
1152
|
+
trace_id_str = str(value).replace("-", "").lower()
|
|
1153
|
+
if len(trace_id_str) == 32: # Valid trace_id length
|
|
1154
|
+
trace_id = int(trace_id_str, 16)
|
|
1155
|
+
elif key == "previous_response_id" and value and isinstance(value, str):
|
|
1156
|
+
previous_response_id = value
|
|
1157
|
+
elif key == "response_id" and value and isinstance(value, str):
|
|
1158
|
+
response_id = value
|
|
1159
|
+
elif key == "conversation_id" and value and isinstance(value, str):
|
|
1160
|
+
conversation_id = value
|
|
1161
|
+
elif key == "agent_id" and value and isinstance(value, str):
|
|
1162
|
+
agent_id = value
|
|
1163
|
+
elif key.endswith("span_id") and value and isinstance(value, str):
|
|
1164
|
+
# Remove dashes if present and convert to int
|
|
1165
|
+
span_id_str = str(value).replace("-", "").lower()
|
|
1166
|
+
if len(span_id_str) == 16: # Valid span_id length (64-bit = 16 hex chars)
|
|
1167
|
+
span_id = int(span_id_str, 16)
|
|
1168
|
+
elif key == "agent_version" and value and isinstance(value, str):
|
|
1169
|
+
agent_version = value
|
|
1170
|
+
elif key == "agent_name" and value and isinstance(value, str):
|
|
1171
|
+
agent_name = value
|
|
1172
|
+
|
|
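
Editor's note (illustrative only): the trace/span id normalisation shown above — strip dashes and convert the hex string to an int when the length is valid (32 hex chars for a trace id, 16 for a span id). The id value is made up.

```python
raw_trace_id = "0af7651916cd43dd-8448eb211c80319c"  # made-up value containing a dash
trace_id_str = raw_trace_id.replace("-", "").lower()
trace_id = int(trace_id_str, 16) if len(trace_id_str) == 32 else None
print(hex(trace_id) if trace_id is not None else "invalid trace id")
```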
1173
|
+
# Log each event as a separate log record
|
|
1174
|
+
for i, event_data in enumerate(events):
|
|
1175
|
+
try:
|
|
1176
|
+
# Prepare log record attributes with specific mappings
|
|
1177
|
+
# The standard attributes are already in https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-events.md#event-eventgen_aievaluationresult
|
|
1178
|
+
metric_name = event_data.get("metric")
|
|
1179
|
+
standard_log_attributes = {}
|
|
1180
|
+
# This attribute makes evaluation events go into the customEvents table in App Insights
|
|
1181
|
+
standard_log_attributes["microsoft.custom_event.name"] = EVALUATION_EVENT_NAME
|
|
1182
|
+
standard_log_attributes["gen_ai.evaluation.name"] = metric_name
|
|
1183
|
+
if event_data.get("score") is not None:
|
|
1184
|
+
standard_log_attributes["gen_ai.evaluation.score.value"] = event_data.get("score")
|
|
1185
|
+
if event_data.get("label") is not None:
|
|
1186
|
+
standard_log_attributes["gen_ai.evaluation.score.label"] = event_data.get("label")
|
|
1187
|
+
|
|
1188
|
+
# Internal proposed attributes
|
|
1189
|
+
# Put it in internal property bag for now, will be expanded if we got sign-off to Otel standard later.
|
|
1190
|
+
internal_log_attributes = _build_internal_log_attributes(
|
|
1191
|
+
event_data, metric_name, evaluator_config, log_attributes
|
|
1192
|
+
)
|
|
1193
|
+
|
|
1194
|
+
# Optional field that may not always be present
|
|
1195
|
+
if "reason" in event_data:
|
|
1196
|
+
standard_log_attributes["gen_ai.evaluation.explanation"] = str(event_data["reason"])
|
|
1197
|
+
|
|
1198
|
+
# Handle error from sample if present
|
|
1199
|
+
# Put the error message in error.type to follow OTel semantic conventions
|
|
1200
|
+
error = event_data.get("sample", {}).get("error", {}).get("message", None)
|
|
1201
|
+
if error:
|
|
1202
|
+
standard_log_attributes["error.type"] = error
|
|
1203
|
+
|
|
1204
|
+
# Handle redteam attack properties if present
|
|
1205
|
+
if "properties" in event_data:
|
|
1206
|
+
properties = event_data["properties"]
|
|
1207
|
+
|
|
1208
|
+
if "attack_success" in properties:
|
|
1209
|
+
internal_log_attributes["gen_ai.redteam.attack.success"] = str(properties["attack_success"])
|
|
1210
|
+
|
|
1211
|
+
if "attack_technique" in properties:
|
|
1212
|
+
internal_log_attributes["gen_ai.redteam.attack.technique"] = str(properties["attack_technique"])
|
|
1213
|
+
|
|
1214
|
+
if "attack_complexity" in properties:
|
|
1215
|
+
internal_log_attributes["gen_ai.redteam.attack.complexity"] = str(
|
|
1216
|
+
properties["attack_complexity"]
|
|
1217
|
+
)
|
|
1218
|
+
|
|
1219
|
+
if "attack_success_threshold" in properties:
|
|
1220
|
+
internal_log_attributes["gen_ai.redteam.attack.success_threshold"] = str(
|
|
1221
|
+
properties["attack_success_threshold"]
|
|
1222
|
+
)
|
|
1223
|
+
|
|
1224
|
+
# Add data source item attributes if present
|
|
1225
|
+
if response_id:
|
|
1226
|
+
standard_log_attributes["gen_ai.response.id"] = response_id
|
|
1227
|
+
if conversation_id:
|
|
1228
|
+
standard_log_attributes["gen_ai.conversation.id"] = conversation_id
|
|
1229
|
+
if previous_response_id:
|
|
1230
|
+
internal_log_attributes["gen_ai.previous.response.id"] = previous_response_id
|
|
1231
|
+
if agent_id:
|
|
1232
|
+
standard_log_attributes["gen_ai.agent.id"] = agent_id
|
|
1233
|
+
if agent_name:
|
|
1234
|
+
standard_log_attributes["gen_ai.agent.name"] = agent_name
|
|
1235
|
+
if agent_version:
|
|
1236
|
+
internal_log_attributes["gen_ai.agent.version"] = agent_version
|
|
1237
|
+
|
|
1238
|
+
# Combine standard and internal attributes, put internal under the properties bag
|
|
1239
|
+
standard_log_attributes["internal_properties"] = json.dumps(internal_log_attributes)
|
|
1240
|
+
# Anonymize IP address to prevent Azure GeoIP enrichment and location tracking
|
|
1241
|
+
standard_log_attributes["http.client_ip"] = "0.0.0.0"
|
|
1242
|
+
|
|
1243
|
+
event_logger.emit(
|
|
1244
|
+
Event(
|
|
1245
|
+
name=EVALUATION_EVENT_NAME,
|
|
1246
|
+
attributes=standard_log_attributes,
|
|
1247
|
+
body=EVALUATION_EVENT_NAME,
|
|
1248
|
+
trace_id=trace_id if trace_id is not None else None,
|
|
1249
|
+
span_id=span_id if span_id is not None else None,
|
|
1250
|
+
)
|
|
1251
|
+
)
|
|
1252
|
+
|
|
1253
|
+
except Exception as e:
|
|
1254
|
+
LOGGER.warning(f"Failed to log event {i}: {e}")
|
|
1255
|
+
|
|
1256
|
+
except Exception as e:
|
|
1257
|
+
LOGGER.error(f"Failed to log events to App Insights: {e}")
|
|
1258
|
+
|
|
1259
|
+
|
|
1260
|
+
def emit_eval_result_events_to_app_insights(
|
|
1261
|
+
app_insights_config: AppInsightsConfig,
|
|
1262
|
+
results: List[Dict],
|
|
1263
|
+
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
|
|
1264
|
+
) -> None:
|
|
1265
|
+
"""
|
|
1266
|
+
Emit evaluation result events to App Insights using OpenTelemetry logging.
|
|
1267
|
+
Each result is logged as an independent log record, potentially including trace context.
|
|
1268
|
+
|
|
1269
|
+
:param app_insights_config: App Insights configuration containing connection string
|
|
1270
|
+
:type app_insights_config: AppInsightsConfig
|
|
1271
|
+
:param results: List of evaluation results to log
|
|
1272
|
+
:type results: List[Dict]
|
|
1273
|
+
"""
|
|
1274
|
+
|
|
1275
|
+
from opentelemetry import _logs
|
|
1276
|
+
from opentelemetry.sdk._logs import LoggerProvider
|
|
1277
|
+
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
|
|
1278
|
+
from opentelemetry.sdk.resources import Resource
|
|
1279
|
+
from opentelemetry.semconv.resource import ResourceAttributes
|
|
1280
|
+
from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter
|
|
1281
|
+
from opentelemetry._events import get_event_logger
|
|
1282
|
+
from opentelemetry.sdk._events import EventLoggerProvider
|
|
1283
|
+
|
|
1284
|
+
if not results:
|
|
1285
|
+
LOGGER.debug("No results to log to App Insights")
|
|
1286
|
+
return
|
|
1287
|
+
|
|
1288
|
+
try:
|
|
1289
|
+
# Configure OpenTelemetry logging with anonymized Resource attributes
|
|
1290
|
+
|
|
1291
|
+
# Create a resource with minimal attributes to prevent sensitive data collection
|
|
1292
|
+
# SERVICE_INSTANCE_ID maps to cloud_RoleInstance in Azure Monitor and prevents
|
|
1293
|
+
# Azure Monitor from auto-detecting the device hostname
|
|
1294
|
+
anonymized_resource = Resource.create(
|
|
1295
|
+
{
|
|
1296
|
+
ResourceAttributes.SERVICE_NAME: "unknown",
|
|
1297
|
+
ResourceAttributes.SERVICE_INSTANCE_ID: "unknown",
|
|
1298
|
+
}
|
|
1299
|
+
)
|
|
1300
|
+
|
|
1301
|
+
logger_provider = LoggerProvider(resource=anonymized_resource)
|
|
1302
|
+
_logs.set_logger_provider(logger_provider)
|
|
1303
|
+
|
|
1304
|
+
# Create Azure Monitor log exporter
|
|
1305
|
+
azure_log_exporter = AzureMonitorLogExporter(connection_string=app_insights_config["connection_string"])
|
|
1306
|
+
|
|
1307
|
+
# Add the Azure Monitor exporter to the logger provider
|
|
1308
|
+
logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
|
|
1309
|
+
|
|
1310
|
+
# Create event logger
|
|
1311
|
+
event_provider = EventLoggerProvider(logger_provider)
|
|
1312
|
+
event_logger = get_event_logger(__name__, event_logger_provider=event_provider)
|
|
1313
|
+
|
|
1314
|
+
# Initialize base log attributes with extra_attributes if present, otherwise empty dict
|
|
1315
|
+
base_log_attributes = app_insights_config.get("extra_attributes", {})
|
|
1316
|
+
|
|
1317
|
+
# Add AppInsights config attributes with proper semantic convention mappings
|
|
1318
|
+
if "run_type" in app_insights_config:
|
|
1319
|
+
base_log_attributes["gen_ai.evaluation.azure_ai_type"] = str(app_insights_config["run_type"])
|
|
1320
|
+
if "schedule_type" in app_insights_config:
|
|
1321
|
+
base_log_attributes["gen_ai.evaluation.azure_ai_scheduled"] = str(app_insights_config["schedule_type"])
|
|
1322
|
+
if "run_id" in app_insights_config:
|
|
1323
|
+
base_log_attributes["gen_ai.evaluation.run.id"] = str(app_insights_config["run_id"])
|
|
1324
|
+
if "project_id" in app_insights_config:
|
|
1325
|
+
base_log_attributes["gen_ai.azure_ai_project.id"] = str(app_insights_config["project_id"])
|
|
1326
|
+
|
|
1327
|
+
for result in results:
|
|
1328
|
+
# Create a copy of base attributes for this result's events
|
|
1329
|
+
log_attributes = base_log_attributes.copy()
|
|
1330
|
+
|
|
1331
|
+
_log_events_to_app_insights(
|
|
1332
|
+
event_logger=event_logger,
|
|
1333
|
+
events=result["results"],
|
|
1334
|
+
log_attributes=log_attributes,
|
|
1335
|
+
data_source_item=result["datasource_item"] if "datasource_item" in result else None,
|
|
1336
|
+
evaluator_config=evaluator_config,
|
|
1337
|
+
app_insights_config=app_insights_config,
|
|
1338
|
+
)
|
|
1339
|
+
# Force flush to ensure events are sent
|
|
1340
|
+
logger_provider.force_flush()
|
|
1341
|
+
LOGGER.info(f"Successfully logged {len(results)} evaluation results to App Insights")
|
|
1342
|
+
|
|
1343
|
+
except Exception as e:
|
|
1344
|
+
LOGGER.error(f"Failed to emit evaluation results to App Insights: {e}")
|
|
1345
|
+
|
|
1346
|
+
|
|
1347
|
+
def _preprocess_data(
|
|
1348
|
+
data: Union[str, os.PathLike],
|
|
1349
|
+
evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
|
|
1350
|
+
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
|
|
1351
|
+
target: Optional[Callable] = None,
|
|
1352
|
+
output_path: Optional[Union[str, os.PathLike]] = None,
|
|
1353
|
+
azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
|
|
1354
|
+
evaluation_name: Optional[str] = None,
|
|
1355
|
+
fail_on_evaluator_errors: bool = False,
|
|
1356
|
+
tags: Optional[Dict[str, str]] = None,
|
|
1357
|
+
**kwargs,
|
|
1358
|
+
) -> __ValidatedData:
|
|
560
1359
|
# Process evaluator config to replace ${target.} with ${data.}
|
|
561
1360
|
if evaluator_config is None:
|
|
562
1361
|
evaluator_config = {}
|
|
563
|
-
evaluator_config = _process_evaluator_config(evaluator_config)
|
|
564
|
-
_validate_columns(input_data_df, evaluators, target, evaluator_config)
|
|
565
|
-
|
|
566
|
-
# Target Run
|
|
567
|
-
pf_client = PFClient(
|
|
568
|
-
config=(
|
|
569
|
-
{"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None
|
|
570
|
-
),
|
|
571
|
-
user_agent=USER_AGENT,
|
|
572
|
-
)
|
|
573
1362
|
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
1363
|
+
input_data_df = _validate_and_load_data(
|
|
1364
|
+
target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name, tags
|
|
1365
|
+
)
|
|
1366
|
+
if target is not None:
|
|
1367
|
+
_validate_columns_for_target(input_data_df, target)
|
|
1368
|
+
|
|
1369
|
+
# extract column mapping dicts into dictionary mapping evaluator name to column mapping
|
|
1370
|
+
column_mapping = _process_column_mappings(
|
|
1371
|
+
{
|
|
1372
|
+
evaluator_name: evaluator_configuration.get("column_mapping", None)
|
|
1373
|
+
for evaluator_name, evaluator_configuration in evaluator_config.items()
|
|
1374
|
+
}
|
|
1375
|
+
)
|
|
577
1376
|
|
|
578
1377
|
# Create default configuration for evaluators that directly maps
|
|
579
1378
|
# input data names to keyword inputs of the same name in the evaluators.
|
|
580
|
-
|
|
581
|
-
|
|
1379
|
+
column_mapping = column_mapping or {}
|
|
1380
|
+
column_mapping.setdefault("default", {})
|
|
1381
|
+
|
|
1382
|
+
# Split normal evaluators and OAI graders
|
|
1383
|
+
evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
|
|
1384
|
+
|
|
1385
|
+
target_run: Optional[BatchClientRun] = None
|
|
1386
|
+
target_generated_columns: Set[str] = set()
|
|
1387
|
+
batch_run_client: BatchClient
|
|
1388
|
+
batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
|
|
1389
|
+
|
|
1390
|
+
def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
|
|
1391
|
+
"""Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
|
|
1392
|
+
_use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
|
|
1393
|
+
_use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
|
|
1394
|
+
|
|
1395
|
+
if _use_run_submitter_client is None and _use_pf_client is None:
|
|
1396
|
+
# If both are unset, return default
|
|
1397
|
+
return "run_submitter"
|
|
1398
|
+
|
|
1399
|
+
if _use_run_submitter_client and _use_pf_client:
|
|
1400
|
+
raise EvaluationException(
|
|
1401
|
+
message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
|
|
1402
|
+
target=ErrorTarget.EVALUATE,
|
|
1403
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
1404
|
+
blame=ErrorBlame.USER_ERROR,
|
|
1405
|
+
)
|
|
1406
|
+
|
|
1407
|
+
if _use_run_submitter_client == False and _use_pf_client == False:
|
|
1408
|
+
return "code_client"
|
|
1409
|
+
|
|
1410
|
+
if _use_run_submitter_client:
|
|
1411
|
+
return "run_submitter"
|
|
1412
|
+
if _use_pf_client:
|
|
1413
|
+
return "pf_client"
|
|
1414
|
+
|
|
1415
|
+
if _use_run_submitter_client is None and _use_pf_client == False:
|
|
1416
|
+
return "run_submitter"
|
|
1417
|
+
if _use_run_submitter_client == False and _use_pf_client is None:
|
|
1418
|
+
return "pf_client"
|
|
1419
|
+
|
|
1420
|
+
assert False, "This should be impossible"
|
|
1421
|
+
|
|
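
Editor's note (illustrative only): the decision table implied by get_client_type above for the private _use_run_submitter_client / _use_pf_client kwargs.

```python
cases = [
    (None, None, "run_submitter"),   # default when neither flag is set
    (True, None, "run_submitter"),
    (None, True, "pf_client"),
    (False, False, "code_client"),
    (None, False, "run_submitter"),
    (False, None, "pf_client"),
]
for use_run_submitter, use_pf, expected in cases:
    print(f"_use_run_submitter_client={use_run_submitter}, _use_pf_client={use_pf} -> {expected}")
# Setting both flags to True raises an EvaluationException instead of returning a client type.
```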
1422
|
+
client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
|
|
1423
|
+
|
|
1424
|
+
if client_type == "run_submitter":
|
|
1425
|
+
batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
|
|
1426
|
+
batch_run_data = input_data_df
|
|
1427
|
+
elif client_type == "pf_client":
|
|
1428
|
+
batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
|
|
1429
|
+
# Ensure the absolute path is passed to pf.run, as a relative path doesn't work with
|
|
1430
|
+
# multiple evaluators. If the path is already absolute, abspath will return the original path.
|
|
1431
|
+
batch_run_data = os.path.abspath(data)
|
|
1432
|
+
elif client_type == "code_client":
|
|
1433
|
+
batch_run_client = CodeClient()
|
|
1434
|
+
batch_run_data = input_data_df
|
|
582
1435
|
|
|
583
1436
|
# If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
|
|
584
1437
|
if data is not None and target is not None:
|
|
585
1438
|
input_data_df, target_generated_columns, target_run = _apply_target_to_data(
|
|
586
|
-
target,
|
|
1439
|
+
target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
|
|
587
1440
|
)
|
|
588
1441
|
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
1442
|
+
# IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
|
|
1443
|
+
# This ensures that evaluators get all rows (including failed ones with NaN values)
|
|
1444
|
+
if isinstance(batch_run_client, ProxyClient):
|
|
1445
|
+
# Create a temporary JSONL file with the complete dataframe
|
|
1446
|
+
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
|
|
1447
|
+
try:
|
|
1448
|
+
for _, row in input_data_df.iterrows():
|
|
1449
|
+
row_dict = row.to_dict()
|
|
1450
|
+
temp_file.write(json.dumps(row_dict) + "\n")
|
|
1451
|
+
temp_file.close()
|
|
1452
|
+
batch_run_data = temp_file.name
|
|
1453
|
+
|
|
1454
|
+
# Update column mappings to use data references instead of run outputs
|
|
1455
|
+
for evaluator_name, mapping in column_mapping.items():
|
|
1456
|
+
mapped_to_values = set(mapping.values())
|
|
1457
|
+
for col in target_generated_columns:
|
|
1458
|
+
# Use data reference instead of run output to ensure we get all rows
|
|
1459
|
+
target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
|
|
1460
|
+
|
|
1461
|
+
# We will add our mapping only if customer did not map target output.
|
|
1462
|
+
if col not in mapping and target_reference not in mapped_to_values:
|
|
1463
|
+
column_mapping[evaluator_name][col] = target_reference
|
|
1464
|
+
|
|
1465
|
+
# Don't pass the target_run since we're now using the complete dataframe
|
|
1466
|
+
target_run = None
|
|
1467
|
+
|
|
1468
|
+
except Exception as e:
|
|
1469
|
+
# Clean up the temp file if something goes wrong
|
|
1470
|
+
if os.path.exists(temp_file.name):
|
|
1471
|
+
os.unlink(temp_file.name)
|
|
1472
|
+
raise e
|
|
1473
|
+
else:
|
|
1474
|
+
# For DataFrame-based clients, update batch_run_data to use the updated input_data_df
|
|
1475
|
+
batch_run_data = input_data_df
|
|
1476
|
+
|
|
1477
|
+
# Update column mappings for DataFrame clients
|
|
1478
|
+
for evaluator_name, mapping in column_mapping.items():
|
|
1479
|
+
mapped_to_values = set(mapping.values())
|
|
1480
|
+
for col in target_generated_columns:
|
|
1481
|
+
target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
|
|
1482
|
+
|
|
1483
|
+
# We will add our mapping only if customer did not map target output.
|
|
1484
|
+
if col not in mapping and target_reference not in mapped_to_values:
|
|
1485
|
+
column_mapping[evaluator_name][col] = target_reference
|
|
1486
|
+
|
|
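
Editor's note (hedged sketch): the temp-JSONL handoff used for the ProxyClient path above — each DataFrame row is serialised as one JSON line so the proxy client re-reads the full dataset, including rows the target failed on. The "__outputs." prefix is an assumption about Prefixes.TSG_OUTPUTS.

```python
import json
import tempfile
import pandas as pd

input_data_df = pd.DataFrame({"query": ["q0", "q1"], "__outputs.response": ["r0", None]})
with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as temp_file:
    for _, row in input_data_df.iterrows():
        temp_file.write(json.dumps(row.to_dict()) + "\n")
print(temp_file.name)  # caller must delete this file afterwards, as the finally block further below does
```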
1487
|
+
# After we have generated all columns, we can check if we have everything we need for evaluators.
|
|
1488
|
+
_validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
|
|
604
1489
|
|
|
605
1490
|
# Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
|
|
606
1491
|
# via target mapping.
|
|
607
1492
|
# If both the data and the output dictionary of the target function
|
|
608
1493
|
# have the same column, then the target function value is used.
|
|
1494
|
+
# NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
|
|
1495
|
+
# Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
|
|
609
1496
|
if input_data_df is not None:
|
|
1497
|
+
if "conversation" in input_data_df.columns or "messages" in input_data_df.columns:
|
|
1498
|
+
# No action is taken when 'conversation' or 'messages' columns are present,
|
|
1499
|
+
# as these indicate chat/conversation data which should not be flattened or mapped by default.
|
|
1500
|
+
pass
|
|
1501
|
+
else:
|
|
1502
|
+
input_data_df = _flatten_object_columns_for_default_mapping(input_data_df)
|
|
1503
|
+
|
|
1504
|
+
# Build default mapping for leaves:
|
|
1505
|
+
if input_data_df is not None:
|
|
1506
|
+
# First, map flattened nested columns (those containing a dot) to leaf names.
|
|
610
1507
|
for col in input_data_df.columns:
|
|
611
|
-
#
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
1508
|
+
# Skip target output columns
|
|
1509
|
+
if col.startswith(Prefixes.TSG_OUTPUTS):
|
|
1510
|
+
continue
|
|
1511
|
+
# Skip root container columns (no dot) here; they'll be handled below if truly primitive.
|
|
1512
|
+
if "." in col:
|
|
1513
|
+
leaf_name = col.split(".")[-1]
|
|
1514
|
+
if leaf_name not in column_mapping["default"]:
|
|
1515
|
+
column_mapping["default"][leaf_name] = f"${{data.{col}}}"
|
|
1516
|
+
|
|
1517
|
+
# Then, handle remaining top-level primitive columns (original logic).
|
|
1518
|
+
for col in input_data_df.columns:
|
|
1519
|
+
if (
|
|
1520
|
+
not col.startswith(Prefixes.TSG_OUTPUTS)
|
|
1521
|
+
and col not in column_mapping["default"].keys()
|
|
1522
|
+
and "." not in col # only pure top-level primitives
|
|
1523
|
+
):
|
|
1524
|
+
column_mapping["default"][col] = f"${{data.{col}}}"
|
|
1525
|
+
|
|
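
Editor's note (illustrative only): the shape of the default column mapping built above, assuming the target-output prefix Prefixes.TSG_OUTPUTS is "__outputs."; the column names are placeholders.

```python
column_mapping = {"default": {}}
for col in ["query", "item.context", "__outputs.response"]:
    if col.startswith("__outputs."):
        continue  # target-generated columns are mapped separately above
    leaf = col.split(".")[-1]  # dotted (flattened) columns map by their leaf name
    column_mapping["default"].setdefault(leaf, f"${{data.{col}}}")
print(column_mapping)
# {'default': {'query': '${data.query}', 'context': '${data.item.context}'}}
```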
1526
|
+
return __ValidatedData(
|
|
1527
|
+
evaluators=evaluators,
|
|
1528
|
+
graders=graders,
|
|
1529
|
+
input_data_df=input_data_df,
|
|
1530
|
+
column_mapping=column_mapping,
|
|
1531
|
+
target_run=target_run,
|
|
1532
|
+
batch_run_client=batch_run_client,
|
|
1533
|
+
batch_run_data=batch_run_data,
|
|
1534
|
+
)
|
|
1535
|
+
|
|
1536
|
+
|
|
1537
|
+
def _flatten_object_columns_for_default_mapping(
|
|
1538
|
+
df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
|
|
1539
|
+
) -> pd.DataFrame:
|
|
1540
|
+
"""Flatten nested dictionary-valued columns into dotted leaf columns.
|
|
1541
|
+
|
|
1542
|
+
For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
|
|
1543
|
+
leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
|
|
1544
|
+
columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
|
|
1545
|
+
all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
|
|
1546
|
+
are all treated as leaves). Existing columns are never overwritten (idempotent behavior).
|
|
1547
|
+
|
|
1548
|
+
Example
|
|
1549
|
+
If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
|
|
1550
|
+
columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.
|
|
1551
|
+
|
|
1552
|
+
:param df: Input DataFrame to flatten in place.
|
|
1553
|
+
:type df: ~pandas.DataFrame
|
|
1554
|
+
:param root_prefixes: Optional iterable restricting which top-level columns are considered
|
|
1555
|
+
for flattening. If ``None``, all columns containing at least one ``dict`` value are processed.
|
|
1556
|
+
:type root_prefixes: Optional[Iterable[str]]
|
|
1557
|
+
:return: The same DataFrame instance (returned for convenient chaining).
|
|
1558
|
+
:rtype: ~pandas.DataFrame
|
|
1559
|
+
"""
|
|
1560
|
+
candidate_cols = []
|
|
1561
|
+
if root_prefixes is not None:
|
|
1562
|
+
candidate_cols = [c for c in root_prefixes if c in df.columns]
|
|
624
1563
|
else:
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
1564
|
+
# pick columns where at least one non-null value is a dict
|
|
1565
|
+
for c in df.columns:
|
|
1566
|
+
series = df[c]
|
|
1567
|
+
if series.map(lambda v: isinstance(v, dict)).any():
|
|
1568
|
+
candidate_cols.append(c)
|
|
1569
|
+
|
|
1570
|
+
def _extract_leaves(obj: Any, prefix: str) -> Iterator[Tuple[str, Any]]:
|
|
1571
|
+
if isinstance(obj, dict):
|
|
1572
|
+
for k, v in obj.items():
|
|
1573
|
+
new_prefix = f"{prefix}.{k}" if prefix else k
|
|
1574
|
+
if isinstance(v, dict):
|
|
1575
|
+
yield from _extract_leaves(v, new_prefix)
|
|
1576
|
+
else:
|
|
1577
|
+
# treat list / primitive / None as leaf
|
|
1578
|
+
yield new_prefix, v
|
|
1579
|
+
|
|
1580
|
+
for root_col in candidate_cols:
|
|
1581
|
+
# Build a union of leaf paths across rows to ensure consistent columns
|
|
1582
|
+
leaf_paths: Set[str] = set()
|
|
1583
|
+
for val in df[root_col]:
|
|
1584
|
+
if isinstance(val, dict):
|
|
1585
|
+
for path, _ in _extract_leaves(val, root_col):
|
|
1586
|
+
leaf_paths.add(path)
|
|
1587
|
+
|
|
1588
|
+
if not leaf_paths:
|
|
1589
|
+
continue
|
|
1590
|
+
|
|
1591
|
+
# Create each flattened column if absent
|
|
1592
|
+
for path in leaf_paths:
|
|
1593
|
+
if path in df.columns:
|
|
1594
|
+
continue # already present
|
|
1595
|
+
relative_keys = path[len(root_col) + 1 :].split(".") if len(path) > len(root_col) else []
|
|
1596
|
+
|
|
1597
|
+
def getter(root_val: Any) -> Any:
|
|
1598
|
+
cur = root_val
|
|
1599
|
+
for rk in relative_keys:
|
|
1600
|
+
if not isinstance(cur, dict):
|
|
1601
|
+
return None
|
|
1602
|
+
cur = cur.get(rk, None)
|
|
1603
|
+
return cur
|
|
1604
|
+
|
|
1605
|
+
df[path] = df[root_col].map(lambda rv: getter(rv) if isinstance(rv, dict) else None)
|
|
1606
|
+
|
|
1607
|
+
return df
|
|
1608
|
+
|
|
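
Editor's note (standalone illustration, not the package helper): what the flattened columns described above look like for a nested 'item' column, approximated here with pandas.json_normalize.

```python
import pandas as pd

df = pd.DataFrame({"query": ["q0", "q1"],
                   "item": [{"a": {"b": 1, "c": 2}}, {"a": {"b": 3}}]})
flat = pd.json_normalize(df["item"].tolist()).add_prefix("item.")
df = pd.concat([df, flat], axis=1)
print([c for c in df.columns if c.startswith("item.")])
# ['item.a.b', 'item.a.c'] -> leaves that the default column mapping can then
# reference as ${data.item.a.b} and ${data.item.a.c}; the second row gets a
# missing value for 'item.a.c'.
```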
640
1609
|
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
1610
|
+
def _run_callable_evaluators(
|
|
1611
|
+
validated_data: __ValidatedData,
|
|
1612
|
+
fail_on_evaluator_errors: bool = False,
|
|
1613
|
+
**kwargs,
|
|
1614
|
+
) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
|
|
1615
|
+
|
|
1616
|
+
# Extract needed values
|
|
1617
|
+
batch_run_client = validated_data["batch_run_client"]
|
|
1618
|
+
target_run = validated_data["target_run"]
|
|
1619
|
+
batch_run_data = validated_data["batch_run_data"]
|
|
1620
|
+
column_mapping = validated_data["column_mapping"]
|
|
1621
|
+
evaluators = validated_data["evaluators"]
|
|
1622
|
+
|
|
1623
|
+
# Clean up temporary file after evaluation if it was created
|
|
1624
|
+
temp_file_to_cleanup = None
|
|
1625
|
+
if (
|
|
1626
|
+
isinstance(batch_run_client, ProxyClient)
|
|
1627
|
+
and isinstance(batch_run_data, str)
|
|
1628
|
+
and batch_run_data.endswith(".jsonl")
|
|
1629
|
+
):
|
|
1630
|
+
# Check if it's a temporary file (contains temp directory path)
|
|
1631
|
+
if tempfile.gettempdir() in batch_run_data:
|
|
1632
|
+
temp_file_to_cleanup = batch_run_data
|
|
645
1633
|
|
|
1634
|
+
try:
|
|
1635
|
+
with EvalRunContext(batch_run_client):
|
|
1636
|
+
runs = {
|
|
1637
|
+
evaluator_name: batch_run_client.run(
|
|
1638
|
+
flow=evaluator,
|
|
1639
|
+
data=batch_run_data,
|
|
1640
|
+
# Don't pass target_run when using complete dataframe
|
|
1641
|
+
run=target_run,
|
|
1642
|
+
evaluator_name=evaluator_name,
|
|
1643
|
+
column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
|
|
1644
|
+
stream=True,
|
|
1645
|
+
name=kwargs.get("_run_name"),
|
|
1646
|
+
)
|
|
1647
|
+
for evaluator_name, evaluator in evaluators.items()
|
|
1648
|
+
}
|
|
1649
|
+
|
|
1650
|
+
# get_details needs to be called within EvalRunContext scope in order to have user agent populated
|
|
1651
|
+
per_evaluator_results: Dict[str, __EvaluatorInfo] = {
|
|
1652
|
+
evaluator_name: {
|
|
1653
|
+
"result": batch_run_client.get_details(run, all_results=True),
|
|
1654
|
+
"metrics": batch_run_client.get_metrics(run),
|
|
1655
|
+
"run_summary": batch_run_client.get_run_summary(run),
|
|
1656
|
+
}
|
|
1657
|
+
for evaluator_name, run in runs.items()
|
|
1658
|
+
}
|
|
1659
|
+
finally:
|
|
1660
|
+
# Clean up temporary file if it was created
|
|
1661
|
+
if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
|
|
1662
|
+
try:
|
|
1663
|
+
os.unlink(temp_file_to_cleanup)
|
|
1664
|
+
except Exception as e:
|
|
1665
|
+
LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
|
|
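
Editor's note (illustrative only): the shape of per_evaluator_results assembled above; every value is a placeholder, the real entries come from the batch client.

```python
per_evaluator_results = {
    "coherence": {
        "result": "<pandas.DataFrame with one row of outputs per input line>",
        "metrics": {"gpt_coherence": 4.2},
        "run_summary": {"completed_lines": 10, "failed_lines": 0, "log_path": "<run log dir>"},
    },
}
```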
646
1666
|
# Concatenate all results
|
|
647
|
-
evaluators_result_df =
|
|
1667
|
+
evaluators_result_df = pd.DataFrame()
|
|
648
1668
|
evaluators_metric = {}
|
|
649
|
-
for evaluator_name,
|
|
650
|
-
|
|
1669
|
+
for evaluator_name, evaluator_result in per_evaluator_results.items():
|
|
1670
|
+
if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
|
|
1671
|
+
_print_summary(per_evaluator_results)
|
|
1672
|
+
_turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
|
|
1673
|
+
|
|
1674
|
+
evaluator_result_df = evaluator_result["result"]
|
|
651
1675
|
|
|
652
1676
|
# drop input columns
|
|
653
1677
|
evaluator_result_df = evaluator_result_df.drop(
|
|
@@ -670,27 +1694,826 @@ def _evaluate( # pylint: disable=too-many-locals
|
|
|
670
1694
|
else evaluator_result_df
|
|
671
1695
|
)
|
|
672
1696
|
|
|
673
|
-
evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in
|
|
1697
|
+
evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
|
|
674
1698
|
|
|
675
1699
|
# Rename columns, generated by target function to outputs instead of inputs.
|
|
676
1700
|
# If target generates columns, already present in the input data, these columns
|
|
677
1701
|
# will be marked as outputs already so we do not need to rename them.
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
1702
|
+
|
|
1703
|
+
input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
|
|
1704
|
+
eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
|
|
1705
|
+
eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
|
|
1706
|
+
eval_metrics.update(evaluators_metric)
|
|
1707
|
+
|
|
1708
|
+
return eval_result_df, eval_metrics, per_evaluator_results
|
|
1709
|
+
|
|
1710
|
+
|
|
1711
|
+
def _map_names_to_builtins(
|
|
1712
|
+
evaluators: Dict[str, Callable],
|
|
1713
|
+
graders: Dict[str, AzureOpenAIGrader],
|
|
1714
|
+
) -> Dict[str, str]:
|
|
1715
|
+
"""
|
|
1716
|
+
Construct a mapping from user-supplied evaluator names to which known, built-in
|
|
1717
|
+
evaluator or grader they refer to. Custom evaluators are excluded from the mapping
|
|
1718
|
+
as we only want to track built-in evaluators and graders.
|
|
1719
|
+
|
|
1720
|
+
:param evaluators: The dictionary of evaluators.
|
|
1721
|
+
:type evaluators: Dict[str, Callable]
|
|
1722
|
+
:param graders: The dictionary of graders.
|
|
1723
|
+
:type graders: Dict[str, AzureOpenAIGrader]
|
|
1724
|
+
:param evaluator_config: The configuration for evaluators.
|
|
1725
|
+
:type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
|
|
1726
|
+
|
|
1727
|
+
"""
|
|
1728
|
+
from .._eval_mapping import EVAL_CLASS_MAP
|
|
1729
|
+
|
|
1730
|
+
name_map = {}
|
|
1731
|
+
|
|
1732
|
+
for name, evaluator in evaluators.items():
|
|
1733
|
+
# Check if the evaluator is a known built-in evaluator
|
|
1734
|
+
found_eval = False
|
|
1735
|
+
for eval_class, eval_id in EVAL_CLASS_MAP.items():
|
|
1736
|
+
if isinstance(evaluator, eval_class):
|
|
1737
|
+
name_map[name] = eval_id
|
|
1738
|
+
found_eval = True
|
|
1739
|
+
break
|
|
1740
|
+
if not found_eval:
|
|
1741
|
+
# Skip custom evaluators - we only want to track built-in evaluators
|
|
1742
|
+
pass
|
|
1743
|
+
|
|
1744
|
+
for name, grader in graders.items():
|
|
1745
|
+
name_map[name] = grader.id
|
|
1746
|
+
|
|
1747
|
+
return name_map
|
|
1748
|
+
|
|
1749
|
+
|
|
1750
|
+
def _turn_error_logs_into_exception(log_path: str) -> None:
|
|
1751
|
+
"""Produce an EvaluationException using the contents of the inputted
|
|
1752
|
+
file as the error message.
|
|
1753
|
+
|
|
1754
|
+
:param log_path: The path to the error log file.
|
|
1755
|
+
:type log_path: str
|
|
1756
|
+
"""
|
|
1757
|
+
with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
|
|
1758
|
+
error_message = file.read()
|
|
1759
|
+
raise EvaluationException(
|
|
1760
|
+
message=error_message,
|
|
1761
|
+
target=ErrorTarget.EVALUATE,
|
|
1762
|
+
category=ErrorCategory.FAILED_EXECUTION,
|
|
1763
|
+
blame=ErrorBlame.UNKNOWN,
|
|
689
1764
|
)
|
|
690
1765
|
|
|
691
|
-
result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
|
|
692
1766
|
|
|
693
|
-
|
|
694
|
-
|
|
+def _convert_results_to_aoai_evaluation_results(
+    results: EvaluationResult,
+    logger: logging.Logger,
+    eval_id: Optional[str] = None,
+    eval_run_id: Optional[str] = None,
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] = None,
+    eval_run_summary: Optional[Dict[str, Any]] = None,
+    eval_meta_data: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Convert evaluation results to AOAI evaluation results format.
+
+    Each row of input results.rows looks like:
+    {"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe",
+    "inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.",
+    "outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5}
+
+    Convert each row into new RunOutputItem object with results array.
+
+    :param results: The evaluation results to convert
+    :type results: EvaluationResult
+    :param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria
+    :type eval_meta_data: Dict[str, Any]
+    :param logger: Logger instance
+    :type logger: logging.Logger
+    :return: EvaluationResult with converted evaluation results in AOAI format
+    :rtype: EvaluationResult
+    """
 
-
+    if evaluators is None:
+        return
+
+    # Get the testing_criteria_name and testing_criteria_type from evaluators
+    testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {}
+    criteria_name_types_from_meta: Optional[Dict[str, str]] = {}
+    if eval_meta_data and "testing_criteria" in eval_meta_data:
+        testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")
+        if testing_criteria_list is not None:
+            for criteria in testing_criteria_list:
+                criteria_name = criteria.get("name")
+                criteria_type = criteria.get("type")
+                if criteria_name is not None and criteria_type is not None:
+                    criteria_name_types_from_meta[criteria_name] = criteria
+
+    for criteria_name, evaluator in evaluators.items():
+        criteria_type = None
+        metrics = []
+        if criteria_name in criteria_name_types_from_meta:
+            criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
+            evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
+            current_evaluator_metrics = criteria_name_types_from_meta[criteria_name].get("metrics", None)
+            if current_evaluator_metrics and len(current_evaluator_metrics) > 0:
+                metrics.extend(current_evaluator_metrics)
+            elif evaluator_name:
+                if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+                    evaluator_name = evaluator_name.replace("builtin.", "")
+                metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
+                if metrics_mapped and len(metrics_mapped) > 0:
+                    metrics.extend(metrics_mapped)
+                else:
+                    metrics.append(criteria_name)
+            else:
+                metrics.append(criteria_name)
+        elif isinstance(evaluator, AzureOpenAIGrader):
+            criteria_type = evaluator._type  # pylint: disable=protected-access
+            metrics.append(criteria_name)
+        elif isinstance(evaluator, EvaluatorBase):
+            criteria_type = "azure_ai_evaluator"
+            evaluator_class_name = evaluator.__class__.__name__
+            eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None)
+            if eval_name:
+                metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
+                if metrics_mapped and len(metrics_mapped) > 0:
+                    metrics.extend(metrics_mapped)
+                else:
+                    metrics.append(criteria_name)
+        else:
+            criteria_type = "unknown"
+            metrics.append(criteria_name)
+        testing_criteria_name_types_metrics[criteria_name] = {"type": criteria_type, "metrics": metrics}
+
+    created_time = int(time.time())
+    converted_rows = []
+
+    for row_idx, row in enumerate(results.get("rows", [])):
+        # Group outputs by test criteria name
+        criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()}
+        input_groups = {}
+        top_sample = {}
+        for key, value in row.items():
+            if key.startswith("outputs."):
+                # Parse key: outputs.<test-criteria-name>.<metric>
+                parts = key.split(".", 2)  # Split into max 3 parts: ['outputs', '<criteria-name>', '<metric>']
+                if len(parts) >= 3:
+                    criteria_name = parts[1]
+                    metric_name = parts[2]
+
+                    if criteria_name not in criteria_groups:
+                        criteria_groups[criteria_name] = {}
+
+                    criteria_groups[criteria_name][metric_name] = value
+            elif key.startswith("inputs."):
+                input_key = key.replace("inputs.", "")
+                if input_key not in input_groups:
+                    input_groups[input_key] = value
+
+        # Convert each criteria group to RunOutputItem result
+        run_output_results = []
+        for criteria_name, metrics in criteria_groups.items():
+            # Extract metrics for this criteria
+            expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
+            criteria_type = testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "unknown")
+            result_per_metric = {}
+            # Find score - look for various score patterns
+            for metric_key, metric_value in metrics.items():
+                if metric_key.endswith("_score") or metric_key == "score":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"score": metric_value}
+                    else:
+                        result_per_metric[metric]["score"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
+                if metric_key == "passed":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"passed": metric_value}
+                    else:
+                        result_per_metric[metric]["passed"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "passed", metric, metric_value)
+                elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    label = metric_value
+                    passed = (
+                        True if (str(metric_value).lower() == "pass" or str(metric_value).lower() == "true") else False
+                    )
+                    if metric not in result_per_metric:
+                        if criteria_type == "azure_ai_evaluator":
+                            result_per_metric[metric] = {"label": label, "passed": passed}
+                        else:
+                            result_per_metric[metric] = {"label": label}
+                    else:
+                        result_per_metric[metric]["label"] = metric_value
+                        if criteria_type == "azure_ai_evaluator":
+                            result_per_metric[metric]["passed"] = passed
+                    _append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
+                    if criteria_type == "azure_ai_evaluator":
+                        _append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
+                elif (
+                    metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
+                ) or metric_key == "reason":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"reason": metric_value}
+                    else:
+                        result_per_metric[metric]["reason"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
+                elif metric_key.endswith("_threshold") or metric_key == "threshold":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"threshold": metric_value}
+                    else:
+                        result_per_metric[metric]["threshold"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
+                elif metric_key == "sample":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
+                elif metric_key.endswith("_finish_reason"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"finish_reason": metric_value}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"finish_reason": metric_value}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "finish_reason" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["finish_reason"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "finish_reason"
+                    )
+                elif metric_key.endswith("_model"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"model": metric_value}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"model": metric_value}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "model" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["model"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
+                elif metric_key.endswith("_sample_input"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    input_metric_val_json: Optional[List[Dict[str, Any]]] = []
+                    try:
+                        input_metric_val_json = json.loads(metric_value)
+                    except Exception as e:
+                        logger.warning(f"Failed to parse _sample_input value as JSON: {e}")
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"input": input_metric_val_json}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"input": input_metric_val_json}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "input" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["input"] = input_metric_val_json
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, input_metric_val_json, "input"
+                    )
+                elif metric_key.endswith("_sample_output"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    output_metric_val_json: Optional[List[Dict[str, Any]]] = []
+                    try:
+                        output_metric_val_json = json.loads(metric_value)
+                    except Exception as e:
+                        logger.warning(f"Failed to parse _sample_output value as JSON: {e}")
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"output": output_metric_val_json}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"output": output_metric_val_json}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "output" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["output"] = output_metric_val_json
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, output_metric_val_json, "output"
+                    )
+                elif metric_key.endswith("_total_tokens"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    metric_value = None if _is_none_or_nan(metric_value) else metric_value
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"usage": {"total_tokens": metric_value}}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "usage" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
+                    )
+                elif metric_key.endswith("_prompt_tokens"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    metric_value = None if _is_none_or_nan(metric_value) else metric_value
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"usage": {"prompt_tokens": metric_value}}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "usage" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
+                    )
+                elif metric_key.endswith("_completion_tokens"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    metric_value = None if _is_none_or_nan(metric_value) else metric_value
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"usage": {"completion_tokens": metric_value}}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "usage" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
+                    )
+                elif not any(
+                    metric_key.endswith(suffix)
+                    for suffix in [
+                        "_result",
+                        "_reason",
+                        "_threshold",
+                        "_label",
+                        "_score",
+                        "_model",
+                        "_finish_reason",
+                        "_sample_input",
+                        "_sample_output",
+                        "_total_tokens",
+                        "_prompt_tokens",
+                        "_completion_tokens",
+                    ]
+                ):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    # If no score found yet and this doesn't match other patterns, use as score
+                    if metric_key == metric and metric not in result_per_metric:
+                        result_per_metric[metric] = {"score": metric_value}
+                    elif metric_key == metric and result_per_metric[metric].get("score", None) is None:
+                        result_per_metric[metric]["score"] = metric_value
+
+            for metric, metric_values in result_per_metric.items():
+                score = metric_values.get("score", None)
+                label = metric_values.get("label", None)
+                reason = metric_values.get("reason", None)
+                threshold = metric_values.get("threshold", None)
+                passed = metric_values.get("passed", None)
+                sample = metric_values.get("sample", None)
+
+                # Create result object for this criteria
+                result_obj = {
+                    "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                        "type", "azure_ai_evaluator"
+                    ),
+                    "name": criteria_name,  # Use criteria name as name
+                    "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                }
+                # Add optional fields
+                if (
+                    metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
+                    or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
+                    or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
+                ):
+                    copy_label = label
+                    if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
+                        label = "fail"
+                        score = 0.0
+                        passed = False
+                    else:
+                        label = "pass"
+                        score = 1.0
+                        passed = True
+                result_obj["score"] = (
+                    score if not (score is None or (isinstance(score, float) and math.isnan(score))) else None
+                )
+                result_obj["label"] = label
+                result_obj["reason"] = reason
+                result_obj["threshold"] = threshold
+                result_obj["passed"] = passed
+
+                if sample is not None:
+                    result_obj["sample"] = sample
+                    top_sample = sample  # Save top sample for the row
+                run_output_results.append(result_obj)
+
+            if (
+                eval_run_summary
+                and criteria_name in eval_run_summary
+                and isinstance(eval_run_summary[criteria_name], dict)
+                and "error_code" in eval_run_summary[criteria_name]
+            ) and eval_run_summary[criteria_name].get("error_code", None) is not None:
+                error_info = (
+                    {
+                        "code": eval_run_summary[criteria_name].get("error_code", None),
+                        "message": eval_run_summary[criteria_name].get("error_message", None),
+                    }
+                    if eval_run_summary[criteria_name].get("error_code", None) is not None
+                    else None
+                )
+                sample = {"error": error_info} if error_info is not None else None
+                # Create result object for this criteria
+                metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
+                for metric in metrics:
+                    should_add_error_summary = True
+                    for result in run_output_results:
+                        if result.get("name", None) == criteria_name and result.get("metric", None) == metric:
+                            rs_score = result.get("score", None)
+                            rs_threshold = result.get("threshold", None)
+                            rs_label = result.get("label", None)
+                            rs_reason = result.get("reason", None)
+                            if (
+                                _is_none_or_nan(rs_score)
+                                and _is_none_or_nan(rs_threshold)
+                                and _is_none_or_nan(rs_label)
+                                and _is_none_or_nan(rs_reason)
+                            ):
+                                run_output_results.remove(result)
+                            else:
+                                should_add_error_summary = False
+                            break  # Skip if already have result for this criteria and metric
+                    if should_add_error_summary:
+                        result_obj = {
+                            "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                                "type", "azure_ai_evaluator"
+                            ),
+                            "name": criteria_name,  # Use criteria name as name
+                            "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                            "score": None,
+                            "label": None,
+                            "reason": None,
+                            "threshold": None,
+                            "passed": None,
+                            "sample": sample,
+                        }
+                        run_output_results.append(result_obj)
+
+        # Create RunOutputItem structure
+        run_output_item = {
+            "object": "eval.run.output_item",
+            "id": f"{row_idx+1}",
+            "run_id": eval_run_id,
+            "eval_id": eval_id,
+            "created_at": created_time,
+            "datasource_item_id": row_idx,
+            "datasource_item": input_groups,
+            "results": run_output_results,
+            "status": "completed" if len(run_output_results) > 0 else "error",
+        }
+
+        run_output_item["sample"] = top_sample
+
+        converted_rows.append(run_output_item)
+
+    # Create converted results maintaining the same structure
+    results["_evaluation_results_list"] = converted_rows
+    logger.info(
+        f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
+    )
+    # Calculate summary statistics
+    evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger, criteria_name_types_from_meta)
+    results["_evaluation_summary"] = evaluation_summary
+    logger.info(
+        f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
+    )
+
+
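
For orientation, a hand-written example of one converted row as appended to results["_evaluation_results_list"] by the function above; every value is invented and only the key layout is meant to be illustrative:

    # Shape illustration only; not produced by running the converter.
    example_run_output_item = {
        "object": "eval.run.output_item",
        "id": "1",
        "run_id": "<eval_run_id>",
        "eval_id": "<eval_id>",
        "created_at": 1700000000,
        "datasource_item_id": 0,
        "datasource_item": {"query": "What is the capital of France?", "ground_truth": "Paris is the capital of France."},
        "results": [
            {
                "type": "azure_ai_evaluator",
                "name": "F1_score",
                "metric": "f1_score",
                "score": 1.0,
                "label": "pass",
                "reason": None,
                "threshold": 0.5,
                "passed": True,
            }
        ],
        "status": "completed",
        "sample": {},
    }
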
+def _is_none_or_nan(value: Any) -> bool:
+    """
+    Check if a value is None or NaN.
+
+    :param value: The value to check
+    :type value: Any
+    :return: True if the value is None or NaN, False otherwise
+    :rtype: bool
+    """
+    if value is None:
+        return True
+    if isinstance(value, float) and math.isnan(value):
+        return True
+    if isinstance(value, str) and value.lower() in ["nan", "null", "none", ""]:
+        return True
+    return False
+
+
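
A quick behavioral sketch of _is_none_or_nan (the import path is an assumption):

    from azure.ai.evaluation._evaluate._evaluate import _is_none_or_nan  # assumed path

    for value in (None, float("nan"), "NaN", "", 0.7, "0.7"):
        print(repr(value), "->", _is_none_or_nan(value))
    # The first four are treated as missing values; 0.7 and "0.7" are not.
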
+def _append_indirect_attachments_to_results(
+    current_result_dict: Dict[str, Any],
+    result_name: str,
+    metric: str,
+    metric_value: Any,
+    nested_result_name: Optional[str] = None,
+    secondnested_result_name: Optional[str] = None,
+) -> None:
+    """
+    Append indirect attachments to the current result dictionary.
+
+    :param current_result_dict: The current result dictionary to update
+    :type current_result_dict: Dict[str, Any]
+    :param result_name: The result name
+    :type result_name: str
+    :param metric: The metric name
+    :type metric: str
+    :param metric_value: The value of the metric
+    :type metric_value: Any
+    """
+    if metric == "xpia" and result_name:
+        for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
+            if nested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: metric_value}
+                else:
+                    current_result_dict[metric_extended][result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is not None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {
+                        result_name: {nested_result_name: {secondnested_result_name: metric_value}}
+                    }
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {
+                        nested_result_name: {secondnested_result_name: metric_value}
+                    }
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = {
+                        secondnested_result_name: metric_value
+                    }
+                else:
+                    (
+                        current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
+                    ) = metric_value
+
+
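
A sketch of the xpia fan-out performed above (assumed import path): passing metric "xpia" mirrors the value under the three extended xpia metrics:

    from azure.ai.evaluation._evaluate._evaluate import _append_indirect_attachments_to_results  # assumed path

    results_by_metric = {"xpia": {"score": 1.0}}
    _append_indirect_attachments_to_results(results_by_metric, "score", "xpia", 1.0)
    print(results_by_metric)
    # Expected to now also contain "xpia_manipulated_content", "xpia_intrusion" and
    # "xpia_information_gathering", each mapped to {"score": 1.0}.
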
+def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+    """
+    Get the metric name from the testing criteria and metric key.
+
+    :param testing_criteria_name: The name of the testing criteria
+    :type testing_criteria_name: str
+    :param metric_key: The metric key to look for
+    :type metric_key: str
+    :param metric_list: List of expected metrics for the testing criteria
+    :type metric_list: List[str]
+    :return: The metric name if found, otherwise the testing criteria name
+    :rtype: str
+    """
+    metric = None
+
+    if metric_key == "xpia_manipulated_content":
+        metric = "xpia_manipulated_content"
+        return metric
+    elif metric_key == "xpia_intrusion":
+        metric = "xpia_intrusion"
+        return metric
+    elif metric_key == "xpia_information_gathering":
+        metric = "xpia_information_gathering"
+        return metric
+    for expected_metric in metric_list:
+        if metric_key.startswith(expected_metric):
+            metric = expected_metric
+            break
+    if metric is None:
+        metric = testing_criteria_name
+    return metric
+
+
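
A sketch of the prefix matching above (assumed import path): metric keys are matched by prefix against the expected metrics, with xpia sub-metrics special-cased:

    from azure.ai.evaluation._evaluate._evaluate import _get_metric_from_criteria  # assumed path

    expected = ["f1_score"]
    print(_get_metric_from_criteria("F1_score", "f1_score_threshold", expected))  # -> "f1_score"
    print(_get_metric_from_criteria("F1_score", "unrelated_key", expected))       # -> "F1_score"
    print(_get_metric_from_criteria("xpia", "xpia_intrusion", expected))          # -> "xpia_intrusion"
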
+def _is_primary_metric(metric_name: str, evaluator_name: str) -> bool:
+    """
+    Check if the given metric name is a primary metric.
+
+    :param metric_name: The name of the metric
+    :type metric_name: str
+    :param evaluator_name: The name of the evaluator
+    :type evaluator_name: str
+    :return: True if the metric is a primary metric, False otherwise
+    :rtype: bool
+    """
+    if (
+        not _is_none_or_nan(metric_name)
+        and not _is_none_or_nan(evaluator_name)
+        and evaluator_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS
+        and isinstance(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name], list)
+        and len(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]) > 1
+        and metric_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]
+        and metric_name.lower() != _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name][0].lower()
+    ):
+        return False
+    else:
+        return True
+
+
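
A sketch of the primary-metric check (assumed import path; it also assumes the "indirect_attack" entry of EVALUATOR_NAME_METRICS_MAPPINGS lists "xpia" first, followed by the xpia_* sub-metrics):

    from azure.ai.evaluation._evaluate._evaluate import _is_primary_metric  # assumed path

    print(_is_primary_metric("xpia", "indirect_attack"))            # -> True (assumed first entry)
    print(_is_primary_metric("xpia_intrusion", "indirect_attack"))  # -> False (assumed sub-metric)
    print(_is_primary_metric("anything", "my_custom_evaluator"))    # -> True: unknown evaluators default to primary
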
+def _calculate_aoai_evaluation_summary(
+    aoai_results: list, logger: logging.Logger, criteria_name_types_from_meta: Optional[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """
+    Calculate summary statistics for AOAI evaluation results.
+
+    :param aoai_results: List of AOAI result objects (run_output_items)
+    :type aoai_results: list
+    :return: Summary statistics dictionary
+    :rtype: Dict[str, Any]
+    """
+    # Calculate result counts based on aoaiResults
+    result_counts = {"total": 0, "errored": 0, "failed": 0, "passed": 0}
+
+    # Count results by status and calculate per model usage
+    model_usage_stats = {}  # Dictionary to aggregate usage by model
+    result_counts_stats = {}  # Dictionary to aggregate usage by model
+
+    for aoai_result in aoai_results:
+        logger.info(
+            f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}"
+        )
+        result_counts["total"] += 1
+        passed_count = 0
+        failed_count = 0
+        error_count = 0
+        if isinstance(aoai_result, dict) and "results" in aoai_result:
+            logger.info(
+                f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}"
+            )
+            for result_item in aoai_result["results"]:
+                if isinstance(result_item, dict):
+                    testing_criteria = result_item.get("name", "")
+                    is_primary_metric = True
+                    if (
+                        criteria_name_types_from_meta is not None
+                        and isinstance(criteria_name_types_from_meta, dict)
+                        and testing_criteria in criteria_name_types_from_meta
+                    ):
+                        evaluator_name = criteria_name_types_from_meta[testing_criteria].get("evaluator_name", None)
+                        criteria_type = criteria_name_types_from_meta[testing_criteria].get("type", None)
+                        if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+                            evaluator_name = evaluator_name.replace("builtin.", "")
+                        is_primary_metric = _is_primary_metric(result_item.get("metric", ""), evaluator_name)
+                    if not is_primary_metric:
+                        logger.info(
+                            f"Skip counts for non-primary metric for testing_criteria: {testing_criteria}, metric: {result_item.get('metric', '')}"
+                        )
+                        continue
+                    # Check if the result has a 'passed' field
+                    if "passed" in result_item and result_item["passed"] is not None:
+                        if testing_criteria not in result_counts_stats:
+                            result_counts_stats[testing_criteria] = {
+                                "testing_criteria": testing_criteria,
+                                "failed": 0,
+                                "passed": 0,
+                            }
+                        if result_item["passed"] is True:
+                            passed_count += 1
+                            result_counts_stats[testing_criteria]["passed"] += 1
+
+                        elif result_item["passed"] is False:
+                            failed_count += 1
+                            result_counts_stats[testing_criteria]["failed"] += 1
+                    # Check if the result indicates an error status
+                    elif ("status" in result_item and result_item["status"] in ["error", "errored"]) or (
+                        "sample" in result_item
+                        and isinstance(result_item["sample"], dict)
+                        and result_item["sample"].get("error", None) is not None
+                    ):
+                        error_count += 1
+        elif hasattr(aoai_result, "status") and aoai_result.status == "error":
+            error_count += 1
+        elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
+            error_count += 1
+
+        # Update overall result counts, error counts will not be considered for passed/failed
+        if error_count > 0:
+            result_counts["errored"] += 1
+
+        if failed_count > 0:
+            result_counts["failed"] += 1
+        elif (
+            failed_count == 0 and passed_count > 0 and passed_count == len(aoai_result.get("results", [])) - error_count
+        ):
+            result_counts["passed"] += 1
+
+        # Extract usage statistics from aoai_result.sample
+        sample_data_list = []
+        dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
+        dup_usage_list.remove("xpia")
+        if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
+            for result_item in aoai_result["results"]:
+                if (
+                    isinstance(result_item, dict)
+                    and "sample" in result_item
+                    and result_item["sample"]
+                    and result_item["metric"] not in dup_usage_list
+                ):
+                    sample_data_list.append(result_item["sample"])
+
+        for sample_data in sample_data_list:
+            if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
+                usage_data = sample_data["usage"]
+                model_name = sample_data.get("model", "unknown") if usage_data.get("model", "unknown") else "unknown"
+                if _is_none_or_nan(model_name):
+                    continue
+                if model_name not in model_usage_stats:
+                    model_usage_stats[model_name] = {
+                        "invocation_count": 0,
+                        "total_tokens": 0,
+                        "prompt_tokens": 0,
+                        "completion_tokens": 0,
+                        "cached_tokens": 0,
+                    }
+                # Aggregate usage statistics
+                model_stats = model_usage_stats[model_name]
+                model_stats["invocation_count"] += 1
+                if isinstance(usage_data, dict):
+                    cur_total_tokens = usage_data.get("total_tokens", 0)
+                    if _is_none_or_nan(cur_total_tokens):
+                        cur_total_tokens = 0
+                    cur_prompt_tokens = usage_data.get("prompt_tokens", 0)
+                    if _is_none_or_nan(cur_prompt_tokens):
+                        cur_prompt_tokens = 0
+                    cur_completion_tokens = usage_data.get("completion_tokens", 0)
+                    if _is_none_or_nan(cur_completion_tokens):
+                        cur_completion_tokens = 0
+                    cur_cached_tokens = usage_data.get("cached_tokens", 0)
+                    if _is_none_or_nan(cur_cached_tokens):
+                        cur_cached_tokens = 0
+                    logger.info(
+                        f"Model: {model_name}, cur_total_tokens: {cur_total_tokens}, {_is_none_or_nan(cur_total_tokens)}, cur_prompt_tokens: {cur_prompt_tokens}, cur_completion_tokens: {cur_completion_tokens}, cur_cached_tokens: {cur_cached_tokens}"
+                    )
+                    model_stats["total_tokens"] += cur_total_tokens
+                    model_stats["prompt_tokens"] += cur_prompt_tokens
+                    model_stats["completion_tokens"] += cur_completion_tokens
+                    model_stats["cached_tokens"] += cur_cached_tokens
+
+    # Convert model usage stats to list format matching EvaluationRunPerModelUsage
+    per_model_usage = []
+    for model_name, stats in model_usage_stats.items():
+        per_model_usage.append(
+            {
+                "model_name": model_name,
+                "invocation_count": stats["invocation_count"],
+                "total_tokens": stats["total_tokens"],
+                "prompt_tokens": stats["prompt_tokens"],
+                "completion_tokens": stats["completion_tokens"],
+                "cached_tokens": stats["cached_tokens"],
+            }
+        )
+    result_counts_stats_val = []
+    logger.info(f"\r\n Result counts stats: {result_counts_stats}")
+    for criteria_name, stats_val in result_counts_stats.items():
+        if isinstance(stats_val, dict):
+            logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}")
+            cur_passed = stats_val.get("passed", 0)
+            if _is_none_or_nan(cur_passed):
+                cur_passed = 0
+            cur_failed_count = stats_val.get("failed", 0)
+            if _is_none_or_nan(cur_failed_count):
+                cur_failed_count = 0
+            result_counts_stats_val.append(
+                {
+                    "testing_criteria": criteria_name if not _is_none_or_nan(criteria_name) else "unknown",
+                    "passed": cur_passed,
+                    "failed": cur_failed_count,
+                }
+            )
+    return {
+        "result_counts": result_counts,
+        "per_model_usage": per_model_usage,
+        "per_testing_criteria_results": result_counts_stats_val,
+    }
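
A hand-written illustration of the summary dictionary returned above; the numbers and the model name are invented:

    # Shape illustration only; not produced by running the summary function.
    example_summary = {
        "result_counts": {"total": 2, "errored": 0, "failed": 1, "passed": 1},
        "per_model_usage": [
            {
                "model_name": "gpt-4o",
                "invocation_count": 2,
                "total_tokens": 1234,
                "prompt_tokens": 1000,
                "completion_tokens": 234,
                "cached_tokens": 0,
            }
        ],
        "per_testing_criteria_results": [
            {"testing_criteria": "F1_score", "passed": 1, "failed": 1},
        ],
    }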