azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: the registry lists this as a potentially problematic release of azure-ai-evaluation.
- azure/ai/evaluation/__init__.py +83 -14
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/_common/constants.py +124 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +578 -69
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +505 -27
- azure/ai/evaluation/_constants.py +148 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
- azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
- azure/ai/evaluation/_evaluate/_utils.py +237 -42
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
- azure/ai/evaluation/_exceptions.py +24 -1
- azure/ai/evaluation/_http_utils.py +7 -5
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
- azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +43 -19
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -2,46 +2,72 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import contextlib
 import json
 import logging
+import math
 import os
 import re
-
+import tempfile
+import json
+import time
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast

+from openai import OpenAI, AzureOpenAI
+from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd
-from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
-from promptflow.client import PFClient
-from promptflow.entities import Run

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
+from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader

 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EVALUATION_PASS_FAIL_MAPPING,
     EvaluationMetrics,
-
+    DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
+    BINARY_AGGREGATE_SUFFIX,
+    DEFAULT_OAI_EVAL_RUN_NAME,
+    EVALUATION_EVENT_NAME,
+    _EvaluatorMetricMapping,
+)
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig
+from .._user_agent import UserAgentSingleton
+from ._batch_run import (
+    EvalRunContext,
+    CodeClient,
+    ProxyClient,
+    TargetRunContext,
+    RunSubmitterClient,
 )
-from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
-from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
+    DataLoaderFactory,
+    _log_metrics_and_instance_results_onedp,
+)
+from ._batch_run.batch_clients import BatchClient, BatchClientRun
+
+from ._evaluate_aoai import (
+    _begin_aoai_evaluation,
+    _split_evaluators_and_grader_configs,
+    _get_evaluation_run_results,
+    OAIEvalRunCreationInfo,
 )

-TClient = TypeVar("TClient", ProxyClient, CodeClient)
 LOGGER = logging.getLogger(__name__)

 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
 # means something sufficiently different.
-# Note that content safety metrics are handled
+# Note that content safety metrics are handled separately.
 METRIC_COLUMN_NAME_REPLACEMENTS = {
     "groundedness_pro_label": "groundedness_pro_passing_rate",
 }
@@ -53,6 +79,21 @@ class __EvaluatorInfo(TypedDict):
     run_summary: Dict[str, Any]


+class __ValidatedData(TypedDict):
+    """
+    Simple dictionary that contains ALL pre-processed data and
+    the resultant objects that are needed for downstream evaluation.
+    """
+
+    evaluators: Dict[str, Callable]
+    graders: Dict[str, AzureOpenAIGrader]
+    input_data_df: pd.DataFrame
+    column_mapping: Dict[str, Dict[str, str]]
+    target_run: Optional[BatchClientRun]
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+
+
 def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
     """Identify and average various metrics that need to have the metric name be replaced,
     instead of having the metric match the originating column name.
@@ -70,7 +111,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
             try:
                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
             except EvaluationException:  # only exception that can be cause is all NaN values
@@ -111,7 +152,6 @@ def _aggregate_content_safety_metrics(
         module = inspect.getmodule(evaluators[evaluator_name])
         if (
             module
-            and module.__name__.startswith("azure.ai.evaluation.")
             and metric_name.endswith("_score")
             and metric_name.replace("_score", "") in content_safety_metrics
         ):
@@ -121,7 +161,7 @@ def _aggregate_content_safety_metrics(
     defect_rates = {}
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
-        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
         try:
             col_with_boolean_values = apply_transform_nan_safe(
                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -146,28 +186,151 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     """
     handled_metrics = [
         EvaluationMetrics.PROTECTED_MATERIAL,
+        EvaluationMetrics.FICTIONAL_CHARACTERS,
+        EvaluationMetrics.ARTWORK,
+        EvaluationMetrics.LOGOS_AND_BRANDS,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
+        EvaluationMetrics.CODE_VULNERABILITY,
+        EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
     ]
     label_cols = []
+    details_cols = []
     for col in df.columns:
         metric_name = col.split(".")[1]
         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
             label_cols.append(col)
+        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+            details_cols = col

     label_df = df[label_cols]
     defect_rates = {}
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
-        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
         try:
             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
         except EvaluationException:  # only exception that can be cause is all NaN values
             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
             LOGGER.warning(msg)
+
+    if details_cols:
+        details_df = df[details_cols]
+        detail_defect_rates = {}
+
+        for key, value in details_df.items():
+            _process_rows(value, detail_defect_rates)
+
+        for key, value in detail_defect_rates.items():
+            col_with_boolean_values = pd.to_numeric(value, errors="coerce")
+            try:
+                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                    list_mean_nan_safe(col_with_boolean_values), 2
+                )
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
+                LOGGER.warning(msg)
+
     return label_cols, defect_rates


+def _process_rows(row, detail_defect_rates):
+    for key, value in row.items():
+        if key not in detail_defect_rates:
+            detail_defect_rates[key] = []
+        detail_defect_rates[key].append(value)
+    return detail_defect_rates
+
+
+def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+    """
+    Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+    For each evaluator, calculates the proportion of "pass" results.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A dictionary mapping evaluator names to the proportion of pass results.
+    :rtype: Dict[str, float]
+    """
+    results = {}
+
+    # Find all columns that end with "_result"
+    result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+    for col in result_columns:
+        # Extract the evaluator name from the column name
+        # (outputs.<evaluator>.<metric>_result)
+        parts = col.split(".")
+        evaluator_name = None
+        if len(parts) >= 3:
+            evaluator_name = parts[1]
+        else:
+            LOGGER.warning(
+                "Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col
+            )
+            continue
+        if evaluator_name:
+            # Count the occurrences of each unique value (pass/fail)
+            value_counts = df[col].value_counts().to_dict()
+
+            # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+            total_rows = len(df)
+            pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+            proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+            # Set the result with the evaluator name as the key
+            result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+            results[result_key] = round(proportion, 2)
+
+    return results
+
+
+def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
+    """Identify token count columns from known SDK metrics that should be excluded from aggregation.
+
+    Token counts from custom evaluators are not excluded, only those from EvaluationMetrics
+    and _InternalEvaluationMetrics.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: List of column names to exclude from aggregation.
+    :rtype: List[str]
+    """
+    # Get all metric values from EvaluationMetrics class
+    evaluation_metrics_values = [
+        getattr(EvaluationMetrics, attr)
+        for attr in dir(EvaluationMetrics)
+        if not attr.startswith("_") and isinstance(getattr(EvaluationMetrics, attr), str)
+    ]
+
+    # Get all metric values from _InternalEvaluationMetrics class
+    internal_metrics_values = [
+        getattr(_InternalEvaluationMetrics, attr)
+        for attr in dir(_InternalEvaluationMetrics)
+        if not attr.startswith("_") and isinstance(getattr(_InternalEvaluationMetrics, attr), str)
+    ]
+
+    # Combine all known metrics
+    all_known_metrics = evaluation_metrics_values + internal_metrics_values
+
+    # Find token count columns that belong to known metrics
+    token_count_cols = [
+        col
+        for col in df.columns
+        if (
+            any(
+                col.endswith(f"{metric}_prompt_tokens")
+                or col.endswith(f"{metric}_completion_tokens")
+                or col.endswith(f"{metric}_total_tokens")
+                for metric in all_known_metrics
+            )
+        )
+    ]
+
+    return token_count_cols
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
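Worked example (not part of the diff) of the pass/fail aggregation added above, assuming EVALUATION_PASS_FAIL_MAPPING[True] is the string "pass" and BINARY_AGGREGATE_SUFFIX is "binary_aggregate"; both constants live in azure/ai/evaluation/_constants.py and are not shown in this hunk:

import pandas as pd

df = pd.DataFrame({
    "outputs.relevance.relevance": [4, 2, 5, 4],
    "outputs.relevance.relevance_result": ["pass", "fail", "pass", "pass"],
})

# Mirrors _aggregation_binary_output: pick "outputs.*_result" columns and compute the pass rate.
result_cols = [c for c in df.columns if c.startswith("outputs.") and c.endswith("_result")]
pass_rate = round((df[result_cols[0]] == "pass").sum() / len(df), 2)
assert pass_rate == 0.75  # surfaced in the metrics dict as "relevance.binary_aggregate": 0.75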
@@ -181,6 +344,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
+    binary_metrics = _aggregation_binary_output(df)
+
     df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

     handled_columns = []
@@ -198,9 +363,16 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     handled_columns.extend(label_cols)
     defect_rates.update(label_defect_rates)

+    # Exclude token count columns from aggregation for known SDK metrics
+    token_count_cols = _get_token_count_columns_to_exclude(df)
+    handled_columns.extend(token_count_cols)
+
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)

+    # Convert "not applicable" strings to None to allow proper numeric aggregation
+    df = df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
+
     # NOTE: nan/None values don't count as as booleans, so boolean columns with
     # nan/None values won't have a mean produced from them.
     # This is different from label-based known evaluators, which have special handling.
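Sketch (not from the package) of the token-count exclusion rule wired in above; "violence" stands in for a value exposed by EvaluationMetrics, while "wordcount" is a hypothetical custom evaluator whose token counts are deliberately kept:

known_metrics = ["violence"]  # in the SDK this list is built from EvaluationMetrics / _InternalEvaluationMetrics
columns = [
    "outputs.violence.violence",                 # still aggregated as a mean
    "outputs.violence.violence_total_tokens",    # excluded from aggregation
    "outputs.wordcount.wordcount_total_tokens",  # custom evaluator: not excluded
]
excluded = [
    c
    for c in columns
    if any(
        c.endswith(f"{m}_prompt_tokens") or c.endswith(f"{m}_completion_tokens") or c.endswith(f"{m}_total_tokens")
        for m in known_metrics
    )
]
assert excluded == ["outputs.violence.violence_total_tokens"]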
@@ -208,6 +380,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
     metrics.update(defect_rates)
+
+    # Add binary threshold metrics based on pass/fail results
+    metrics.update(binary_metrics)
+
     return metrics


@@ -299,7 +475,7 @@ def _validate_columns_for_evaluators(
             missing_inputs = []
         else:
             optional_params = (
-                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
                 if hasattr(evaluator, "_OPTIONAL_PARAMS")
                 else []
             )
@@ -344,7 +520,7 @@ def _validate_columns_for_evaluators(
     )


-def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name, tags):
     if data is None:
         msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
@@ -431,10 +607,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     )

     try:
-
+        data_loader = DataLoaderFactory.get_loader(data)
+        initial_data_df = data_loader.load()
     except Exception as e:
         raise EvaluationException(
-            message=f"Unable to load data from '{data}'.
+            message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
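In 1.0.1 the data file was read as JSONL only; the DataLoaderFactory call above (imported from ._utils) is what adds CSV support. A minimal sketch of the dispatch, assuming the factory keys off the file extension (its internals are not shown in this hunk):

import pathlib
import pandas as pd

def load_eval_data(path: str) -> pd.DataFrame:
    # JSON Lines stays the default; .csv files are routed to the CSV reader.
    if pathlib.Path(path).suffix.lower() == ".csv":
        return pd.read_csv(path)
    return pd.read_json(path, lines=True)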
@@ -445,21 +622,21 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj

 def _apply_target_to_data(
     target: Callable,
-    data: Union[str, os.PathLike],
-
+    data: Union[str, os.PathLike, pd.DataFrame],
+    batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
-) -> Tuple[pd.DataFrame, Set[str],
+) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
     """
     Apply the target function to the data set and return updated data and generated columns.

     :param target: The function to be applied to data.
     :type target: Callable
-    :param data: The path to input jsonl file.
+    :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param
-    :type
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -467,36 +644,43 @@ def _apply_target_to_data(
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
+
     _run_name = kwargs.get("_run_name")
-
+    with TargetRunContext(batch_client):
+        run: BatchClientRun = batch_client.run(
+            flow=target,
+            display_name=evaluation_name,
+            data=data,
+            stream=True,
+            name=_run_name,
+            evaluator_name=getattr(target, "__qualname__", "TARGET"),
+        )
+        target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+        run_summary = batch_client.get_run_summary(run)

-
-
-
-
-
-
-
-
-
-
-
-            if "Failed to upload run" in ex.message:
-                msg = (
-                    "Failed to upload the target run to the cloud. "
-                    "This may be caused by insufficient permission to access storage or other errors."
-                )
-                raise EvaluationException(
-                    message=msg,
-                    target=ErrorTarget.EVALUATE,
-                    category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                    blame=ErrorBlame.USER_ERROR,
-                    tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-                ) from ex
+    if run_summary["completed_lines"] == 0:
+        msg = (
+            f"Evaluation target failed to produce any results."
+            f" Please check the logs at {run_summary['log_path']} for more details about cause of failure."
+        )
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )

-
+    # Log a warning if some rows failed
+    failed_lines = run_summary.get("failed_lines", 0)
+    completed_lines = run_summary["completed_lines"]
+    total_lines = failed_lines + completed_lines
+
+    if failed_lines > 0:
+        LOGGER.warning(
+            f"Target function completed {completed_lines} out of {total_lines} rows. "
+            f"{failed_lines} rows failed and will be filled with NaN values."
+        )

-    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
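Note that _apply_target_to_data now drives the target through the pluggable BatchClient interface rather than a promptflow PFClient run. The target itself remains an ordinary callable; a minimal sketch, assuming the promptflow-style convention that the target receives dataset columns as keyword arguments and returns a dict of new columns (the column names here are hypothetical):

def my_target(query: str) -> dict:
    response = f"Echo: {query}"    # stand-in for a real model or agent call
    return {"response": response}  # exposed to evaluators via a ${target.response} column mapping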
@@ -504,6 +688,13 @@ def _apply_target_to_data(
     # Sort output by line numbers
     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
     target_output.sort_index(inplace=True)
+
+    initial_data_with_line_numbers = initial_data.copy()
+    initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+    complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+    target_output = target_output.reindex(complete_index)
+
     target_output.reset_index(inplace=True, drop=False)
     # target_output contains only input columns, taken by function,
     # so we need to concatenate it to the input data frame.
@@ -512,8 +703,8 @@ def _apply_target_to_data(
     # Rename outputs columns to __outputs
     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
     target_output.rename(columns=rename_dict, inplace=True)
-    # Concatenate output to input
-    target_output = pd.concat([
+    # Concatenate output to input - now both dataframes have the same number of rows
+    target_output = pd.concat([initial_data, target_output], axis=1)

     return target_output, generated_columns, run

@@ -531,7 +722,7 @@ def _process_column_mappings(

     processed_config: Dict[str, Dict[str, str]] = {}

-
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
@@ -540,7 +731,7 @@ def _process_column_mappings(

             for map_to_key, map_value in mapping_config.items():
                 # Check if there's any unexpected reference other than ${target.} or ${data.}
-                if
+                if not expected_references.search(map_value):
                     msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                     raise EvaluationException(
                         message=msg,
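The rebuilt expected_references pattern anchors the whole mapping value and accepts dotted paths after the ${data.} or ${target.} root, so nested references validate while anything else is rejected. For example:

import re

expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

assert expected_references.search("${data.query}")
assert expected_references.search("${data.item.context.passage}")  # nested dotted path
assert expected_references.search("${target.response}")
assert not expected_references.search("${env.SECRET}")             # unexpected root
assert not expected_references.search("response")                  # not a reference at all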
@@ -580,27 +771,29 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
     return df


-# @log_evaluate_activity
 def evaluate(
     *,
     data: Union[str, os.PathLike],
-    evaluators: Dict[str, Callable],
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

     :keyword data: Path to the data to be evaluated or passed to target if target is set.
-
+        JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function.
-
+        and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+        Required.
+    :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
     :keyword evaluation_name: Display name of the evaluation.
     :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
@@ -613,8 +806,20 @@ def evaluate(
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
-    :keyword azure_ai_project:
-
+    :keyword azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :paramtype azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
+    :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+        if ANY evaluator fails during their evaluation.
+        Defaults to false, which means that evaluations will continue regardless of failures.
+        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+    :paramtype fail_on_evaluator_errors: bool
+    :keyword tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+        Keys and values must be strings. For more information about tag limits, see:
+        https://learn.microsoft.com/en-us/azure/machine-learning/resource-limits-capacity?view=azureml-api-2#runs
+    :paramtype tags: Optional[Dict[str, str]]
+    :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
+    :paramtype user_agent: Optional[str]
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -625,19 +830,34 @@ def evaluate(
             :end-before: [END evaluate_method]
             :language: python
             :dedent: 8
-            :caption: Run an evaluation on local data with
+            :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
     try:
-
-
-
-
-
-
-
-
-
-
+        user_agent: Optional[str] = kwargs.get("user_agent")
+        with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
+            results = _evaluate(
+                evaluation_name=evaluation_name,
+                target=target,
+                data=data,
+                evaluators_and_graders=evaluators,
+                evaluator_config=evaluator_config,
+                azure_ai_project=azure_ai_project,
+                output_path=output_path,
+                fail_on_evaluator_errors=fail_on_evaluator_errors,
+                tags=tags,
+                **kwargs,
+            )
+            return results
     except Exception as e:
         # Handle multiprocess bootstrap error
         bootstrap_error = (
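Hypothetical call showing the updated public surface (the file name, endpoint URL, tags, and the answer_length evaluator are placeholders; an AzureOpenAIGrader instance could be passed as an evaluators value in the same way, but its constructor is not part of this hunk):

from azure.ai.evaluation import evaluate

def answer_length(response: str) -> dict:
    return {"length": len(response)}

result = evaluate(
    data="data.jsonl",  # JSONL or CSV
    evaluators={"answer_length": answer_length},
    azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    fail_on_evaluator_errors=False,
    tags={"team": "search", "run": "nightly"},
    user_agent="my-app/1.0",  # forwarded through **kwargs per the docstring
)
print(result["metrics"], result["studio_url"])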
@@ -684,22 +904,468 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     print("\n====================================================\n")


+def _print_fail_flag_warning() -> None:
+    print(
+        "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+        + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+        + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+        + "without producing any outputs, since a single failure will cancel the entire run "
+        "when fail_on_evaluator_errors is enabled."
+    )
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
-
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> EvaluationResult:
-
+    if fail_on_evaluator_errors:
+        _print_fail_flag_warning()
+
+    # Turn inputted mess of data into a dataframe, apply targets if needed
+    # split graders and evaluators, and verify that column mappings are sensible.
+    validated_data = _preprocess_data(
+        data=data,
+        evaluators_and_graders=evaluators_and_graders,
+        evaluator_config=evaluator_config,
+        target=target,
+        output_path=output_path,
+        azure_ai_project=azure_ai_project,
+        evaluation_name=evaluation_name,
+        fail_on_evaluator_errors=fail_on_evaluator_errors,
+        tags=tags,
+        **kwargs,
+    )
+
+    # extract relevant info from validated data
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
+    graders = validated_data["graders"]
+    input_data_df = validated_data["input_data_df"]
+    results_df = pd.DataFrame()
+    metrics: Dict[str, float] = {}
+    eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+    eval_run_summary_dict = {}
+
+    # Start OAI eval runs if any graders are present.
+    need_oai_run = len(graders) > 0
+    need_local_run = len(evaluators) > 0
+    need_get_oai_results = False
+    got_local_results = False
+    if need_oai_run:
+        try:
+            aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
+            eval_run_info_list = _begin_aoai_evaluation(graders, column_mapping, input_data_df, aoi_name, **kwargs)
+            need_get_oai_results = len(eval_run_info_list) > 0
+        except EvaluationException as e:
+            if need_local_run:
+                # If there are normal evaluators, don't stop execution and try to run
+                # those.
+                LOGGER.warning(
+                    "Remote Azure Open AI grader evaluations failed during run creation."
+                    + " Continuing with local evaluators."
+                )
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
+    if need_local_run:
+        try:
+            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                validated_data=validated_data, fail_on_evaluator_errors=fail_on_evaluator_errors
+            )
+            results_df = eval_result_df
+            metrics = eval_metrics
+            got_local_results = True
+            # TODO figure out how to update this printing to include OAI results?
+            _print_summary(per_evaluator_results)
+            eval_run_summary_dict = {name: result["run_summary"] for name, result in per_evaluator_results.items()}
+            LOGGER.info(f"run_summary: \r\n{json.dumps(eval_run_summary_dict, indent=4)}")
+        except EvaluationException as e:
+            if need_get_oai_results:
+                # If there are OAI graders, we only print a warning on local failures.
+                LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Retrieve OAI eval run results if needed.
+    if need_get_oai_results:
+        try:
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
+            # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
+
+            # Combine results if both evaluators and graders are present
+            if len(evaluators) > 0:
+                results_df = pd.concat([results_df, aoai_results], axis=1)
+                metrics.update(aoai_metrics)
+            else:
+                # Otherwise combine aoai results with input data df to include input columns in outputs.
+                results_df = pd.concat([input_data_df, aoai_results], axis=1)
+                metrics = aoai_metrics
+        except EvaluationException as e:
+            if got_local_results:
+                # If there are local eval results, we only print a warning on OAI failure.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Done with all evaluations, message outputs into final forms, and log results if needed.
+    name_map = _map_names_to_builtins(evaluators, graders)
+    if is_onedp_project(azure_ai_project):
+        studio_url = _log_metrics_and_instance_results_onedp(
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, tags=tags, **kwargs
+        )
+    else:
+        # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+        trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+        studio_url = None
+        if trace_destination:
+            studio_url = _log_metrics_and_instance_results(
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, tags=tags, **kwargs
+            )
+
+    result_df_dict = results_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+    # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
+
+    eval_id: Optional[str] = kwargs.get("_eval_id")
+    eval_run_id: Optional[str] = kwargs.get("_eval_run_id")
+    eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("_eval_meta_data")
+    if kwargs.get("_convert_to_aoai_evaluation_result", False):
+        _convert_results_to_aoai_evaluation_results(
+            result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data
+        )
+        if app_insights_configuration := kwargs.get("_app_insights_configuration"):
+            emit_eval_result_events_to_app_insights(
+                app_insights_configuration, result["_evaluation_results_list"], evaluator_config
+            )
+
+    if output_path:
+        _write_output(output_path, result)
+    return result
+
+
+def _build_internal_log_attributes(
+    event_data: Dict[str, Any],
+    metric_name: str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]],
+    internal_log_attributes: Dict[str, str],
+) -> Dict[str, str]:
+    """
+    Build internal log attributes for OpenTelemetry logging.
+
+    :param event_data: The event data containing threshold and name information
+    :type event_data: Dict[str, Any]
+    :param metric_name: The name of the metric being evaluated
+    :type metric_name: str
+    :param evaluator_config: Configuration for evaluators
+    :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+    :return: Dictionary of internal log attributes
+    :rtype: Dict[str, str]
+    """
+    # Add threshold if present
+    if event_data.get("threshold"):
+        internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
+
+    # Add testing criteria details if present
+    testing_criteria_name = event_data.get("name")
+    if testing_criteria_name:
+        internal_log_attributes["gen_ai.evaluation.testing_criteria.name"] = testing_criteria_name
+
+        # Get evaluator definition details
+        if evaluator_config and testing_criteria_name in evaluator_config:
+            testing_criteria_config = evaluator_config[testing_criteria_name]
+
+            if evaluator_name := testing_criteria_config.get("_evaluator_name"):
+                internal_log_attributes["gen_ai.evaluator.name"] = str(evaluator_name)
+
+            if evaluator_version := testing_criteria_config.get("_evaluator_version"):
+                internal_log_attributes["gen_ai.evaluator.version"] = str(evaluator_version)
+
+            if evaluator_id := testing_criteria_config.get("_evaluator_id"):
+                internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
+
+            if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
+                metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
+
+                if metric_config_detail:
+                    if metric_config_detail.get("min_value") is not None:
+                        internal_log_attributes["gen_ai.evaluation.min_value"] = str(metric_config_detail["min_value"])
+                    if metric_config_detail.get("max_value") is not None:
+                        internal_log_attributes["gen_ai.evaluation.max_value"] = str(metric_config_detail["max_value"])
+
+    return internal_log_attributes
+
+
+def _log_events_to_app_insights(
+    event_logger,
+    events: List[Dict[str, Any]],
+    log_attributes: Dict[str, Any],
+    app_insights_config: AppInsightsConfig,
+    data_source_item: Optional[Dict[str, Any]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+) -> None:
+    """
+    Log independent events directly to App Insights using OpenTelemetry event logging.
+
+    :param event_logger: OpenTelemetry event logger instance
+    :type event_logger: EventLogger
+    :param events: List of event data dictionaries to log
+    :type events: List[Dict[str, Any]]
+    :param log_attributes: Attributes dict to use for each event (already includes extra_attributes if present)
+    :type log_attributes: Dict[str, Any]
+    :param app_insights_config: App Insights configuration containing connection string
+    :type app_insights_config: AppInsightsConfig
+    :param data_source_item: Data source item containing trace, response, and agent information
+    :type data_source_item: Optional[Dict[str, Any]]
+    """
+
+    from opentelemetry._events import Event
+
+    try:
+        # Initialize values from AppInsights config as defaults
|
|
1138
|
+
trace_id = None
|
|
1139
|
+
span_id = None
|
|
1140
|
+
response_id = None
|
|
1141
|
+
conversation_id = None
|
|
1142
|
+
previous_response_id = None
|
|
1143
|
+
agent_id = app_insights_config.get("agent_id", None)
|
|
1144
|
+
agent_version = app_insights_config.get("agent_version", None)
|
|
1145
|
+
agent_name = app_insights_config.get("agent_name", None)
|
|
1146
|
+
|
|
1147
|
+
# Data source item values have higher priority and will override AppInsights config defaults
|
|
1148
|
+
if data_source_item:
|
|
1149
|
+
for key, value in data_source_item.items():
|
|
1150
|
+
if key.endswith("trace_id") and value and isinstance(value, str):
|
|
1151
|
+
# Remove dashes if present
|
|
1152
|
+
trace_id_str = str(value).replace("-", "").lower()
|
|
1153
|
+
if len(trace_id_str) == 32: # Valid trace_id length
|
|
1154
|
+
trace_id = int(trace_id_str, 16)
|
|
1155
|
+
elif key == "previous_response_id" and value and isinstance(value, str):
|
|
1156
|
+
previous_response_id = value
|
|
1157
|
+
elif key == "response_id" and value and isinstance(value, str):
|
|
1158
|
+
response_id = value
|
|
1159
|
+
elif key == "conversation_id" and value and isinstance(value, str):
|
|
1160
|
+
conversation_id = value
|
|
1161
|
+
elif key == "agent_id" and value and isinstance(value, str):
|
|
1162
|
+
agent_id = value
|
|
1163
|
+
elif key.endswith("span_id") and value and isinstance(value, str):
|
|
1164
|
+
# Remove dashes if present and convert to int
|
|
1165
|
+
span_id_str = str(value).replace("-", "").lower()
|
|
1166
|
+
if len(span_id_str) == 16: # Valid span_id length (64-bit = 16 hex chars)
|
|
1167
|
+
span_id = int(span_id_str, 16)
|
|
1168
|
+
elif key == "agent_version" and value and isinstance(value, str):
|
|
1169
|
+
agent_version = value
|
|
1170
|
+
elif key == "agent_name" and value and isinstance(value, str):
|
|
1171
|
+
agent_name = value
|
|
1172
|
+
|
|
1173
|
+
# Log each event as a separate log record
|
|
1174
|
+
for i, event_data in enumerate(events):
|
|
1175
|
+
try:
|
|
1176
|
+
# Prepare log record attributes with specific mappings
|
|
1177
|
+
# The standard attributes are already in https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-events.md#event-eventgen_aievaluationresult
|
|
1178
|
+
metric_name = event_data.get("metric")
|
|
1179
|
+
standard_log_attributes = {}
|
|
1180
|
+
# This attributes makes evaluation events to go into customEvents table in App Insights
|
|
1181
|
+
standard_log_attributes["microsoft.custom_event.name"] = EVALUATION_EVENT_NAME
|
|
1182
|
+
standard_log_attributes["gen_ai.evaluation.name"] = metric_name
|
|
1183
|
+
if event_data.get("score") is not None:
|
|
1184
|
+
standard_log_attributes["gen_ai.evaluation.score.value"] = event_data.get("score")
|
|
1185
|
+
if event_data.get("label") is not None:
|
|
1186
|
+
standard_log_attributes["gen_ai.evaluation.score.label"] = event_data.get("label")
|
|
1187
|
+
|
|
1188
|
+
# Internal proposed attributes
|
|
1189
|
+
# Put it in internal property bag for now, will be expanded if we got sign-off to Otel standard later.
|
|
1190
|
+
internal_log_attributes = _build_internal_log_attributes(
|
|
1191
|
+
event_data, metric_name, evaluator_config, log_attributes
|
|
1192
|
+
)
|
|
1193
|
+
|
|
1194
|
+
# Optional field that may not always be present
|
|
1195
|
+
if "reason" in event_data:
|
|
1196
|
+
standard_log_attributes["gen_ai.evaluation.explanation"] = str(event_data["reason"])
|
|
1197
|
+
|
|
1198
|
+
# Handle error from sample if present
|
|
1199
|
+
# Put the error message in error.type to follow OTel semantic conventions
|
|
1200
|
+
error = event_data.get("sample", {}).get("error", {}).get("message", None)
|
|
1201
|
+
if error:
|
|
1202
|
+
standard_log_attributes["error.type"] = error
|
|
1203
|
+
|
|
1204
|
+
# Handle redteam attack properties if present
|
|
1205
|
+
if "properties" in event_data:
|
|
1206
|
+
properties = event_data["properties"]
|
|
1207
|
+
|
|
1208
|
+
if "attack_success" in properties:
|
|
1209
|
+
internal_log_attributes["gen_ai.redteam.attack.success"] = str(properties["attack_success"])
|
|
1210
|
+
|
|
1211
|
+
if "attack_technique" in properties:
|
|
1212
|
+
internal_log_attributes["gen_ai.redteam.attack.technique"] = str(properties["attack_technique"])
|
|
1213
|
+
|
|
1214
|
+
if "attack_complexity" in properties:
|
|
1215
|
+
internal_log_attributes["gen_ai.redteam.attack.complexity"] = str(
|
|
1216
|
+
properties["attack_complexity"]
|
|
1217
|
+
)
|
|
1218
|
+
|
|
1219
|
+
if "attack_success_threshold" in properties:
|
|
1220
|
+
internal_log_attributes["gen_ai.redteam.attack.success_threshold"] = str(
|
|
1221
|
+
properties["attack_success_threshold"]
|
|
1222
|
+
)
|
|
1223
|
+
|
|
1224
|
+
# Add data source item attributes if present
|
|
1225
|
+
if response_id:
|
|
1226
|
+
standard_log_attributes["gen_ai.response.id"] = response_id
|
|
1227
|
+
if conversation_id:
|
|
1228
|
+
standard_log_attributes["gen_ai.conversation.id"] = conversation_id
|
|
1229
|
+
if previous_response_id:
|
|
1230
|
+
internal_log_attributes["gen_ai.previous.response.id"] = previous_response_id
|
|
1231
|
+
if agent_id:
|
|
1232
|
+
standard_log_attributes["gen_ai.agent.id"] = agent_id
|
|
1233
|
+
if agent_name:
|
|
1234
|
+
standard_log_attributes["gen_ai.agent.name"] = agent_name
|
|
1235
|
+
if agent_version:
|
|
1236
|
+
internal_log_attributes["gen_ai.agent.version"] = agent_version
|
|
1237
|
+
|
|
1238
|
+
# Combine standard and internal attributes, put internal under the properties bag
|
|
1239
|
+
standard_log_attributes["internal_properties"] = json.dumps(internal_log_attributes)
|
|
1240
|
+
# Anonymize IP address to prevent Azure GeoIP enrichment and location tracking
|
|
1241
|
+
standard_log_attributes["http.client_ip"] = "0.0.0.0"
|
|
1242
|
+
|
|
1243
|
+
event_logger.emit(
|
|
1244
|
+
Event(
|
|
1245
|
+
name=EVALUATION_EVENT_NAME,
|
|
1246
|
+
attributes=standard_log_attributes,
|
|
1247
|
+
body=EVALUATION_EVENT_NAME,
|
|
1248
|
+
trace_id=trace_id if trace_id is not None else None,
|
|
1249
|
+
span_id=span_id if span_id is not None else None,
|
|
1250
|
+
)
|
|
1251
|
+
)
|
|
1252
|
+
|
|
1253
|
+
except Exception as e:
|
|
1254
|
+
LOGGER.warning(f"Failed to log event {i}: {e}")
|
|
1255
|
+
|
|
1256
|
+
except Exception as e:
|
|
1257
|
+
LOGGER.error(f"Failed to log events to App Insights: {e}")
|
|
699
1258
|
|
|
1259
|
+
|
|
1260
|
+
def emit_eval_result_events_to_app_insights(
|
|
1261
|
+
app_insights_config: AppInsightsConfig,
|
|
1262
|
+
results: List[Dict],
|
|
1263
|
+
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
|
|
1264
|
+
) -> None:
|
|
1265
|
+
"""
|
|
1266
|
+
Emit evaluation result events to App Insights using OpenTelemetry logging.
|
|
1267
|
+
Each result is logged as an independent log record, potentially including trace context.
|
|
1268
|
+
|
|
1269
|
+
:param app_insights_config: App Insights configuration containing connection string
|
|
1270
|
+
:type app_insights_config: AppInsightsConfig
|
|
1271
|
+
:param results: List of evaluation results to log
|
|
1272
|
+
:type results: List[Dict]
|
|
1273
|
+
"""
|
|
1274
|
+
|
|
1275
|
+
from opentelemetry import _logs
|
|
1276
|
+
from opentelemetry.sdk._logs import LoggerProvider
|
|
1277
|
+
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
|
|
1278
|
+
from opentelemetry.sdk.resources import Resource
|
|
1279
|
+
from opentelemetry.semconv.resource import ResourceAttributes
|
|
1280
|
+
from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter
|
|
1281
|
+
from opentelemetry._events import get_event_logger
|
|
1282
|
+
from opentelemetry.sdk._events import EventLoggerProvider
|
|
1283
|
+
|
|
1284
|
+
if not results:
|
|
1285
|
+
LOGGER.debug("No results to log to App Insights")
|
|
1286
|
+
return
|
|
1287
|
+
|
|
1288
|
+
try:
|
|
1289
|
+
# Configure OpenTelemetry logging with anonymized Resource attributes
|
|
1290
|
+
|
|
1291
|
+
# Create a resource with minimal attributes to prevent sensitive data collection
|
|
1292
|
+
# SERVICE_INSTANCE_ID maps to cloud_RoleInstance in Azure Monitor and prevents
|
|
1293
|
+
# Azure Monitor from auto-detecting the device hostname
|
|
1294
|
+
anonymized_resource = Resource.create(
|
|
1295
|
+
{
|
|
1296
|
+
ResourceAttributes.SERVICE_NAME: "unknown",
|
|
1297
|
+
ResourceAttributes.SERVICE_INSTANCE_ID: "unknown",
|
|
1298
|
+
}
|
|
1299
|
+
)
|
|
1300
|
+
|
|
1301
|
+
logger_provider = LoggerProvider(resource=anonymized_resource)
|
|
1302
|
+
_logs.set_logger_provider(logger_provider)
|
|
1303
|
+
|
|
1304
|
+
# Create Azure Monitor log exporter
|
|
1305
|
+
azure_log_exporter = AzureMonitorLogExporter(connection_string=app_insights_config["connection_string"])
|
|
1306
|
+
|
|
1307
|
+
# Add the Azure Monitor exporter to the logger provider
|
|
1308
|
+
logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
|
|
1309
|
+
|
|
1310
|
+
# Create event logger
|
|
1311
|
+
event_provider = EventLoggerProvider(logger_provider)
|
|
1312
|
+
event_logger = get_event_logger(__name__, event_logger_provider=event_provider)
|
|
1313
|
+
|
|
1314
|
+
# Initialize base log attributes with extra_attributes if present, otherwise empty dict
|
|
1315
|
+
base_log_attributes = app_insights_config.get("extra_attributes", {})
|
|
1316
|
+
|
|
1317
|
+
# Add AppInsights config attributes with proper semantic convention mappings
|
|
1318
|
+
if "run_type" in app_insights_config:
|
|
1319
|
+
base_log_attributes["gen_ai.evaluation.azure_ai_type"] = str(app_insights_config["run_type"])
|
|
1320
|
+
if "schedule_type" in app_insights_config:
|
|
1321
|
+
base_log_attributes["gen_ai.evaluation.azure_ai_scheduled"] = str(app_insights_config["schedule_type"])
|
|
1322
|
+
if "run_id" in app_insights_config:
|
|
1323
|
+
base_log_attributes["gen_ai.evaluation.run.id"] = str(app_insights_config["run_id"])
|
|
1324
|
+
if "project_id" in app_insights_config:
|
|
1325
|
+
base_log_attributes["gen_ai.azure_ai_project.id"] = str(app_insights_config["project_id"])
|
|
1326
|
+
|
|
1327
|
+
for result in results:
|
|
1328
|
+
# Create a copy of base attributes for this result's events
|
|
1329
|
+
log_attributes = base_log_attributes.copy()
|
|
1330
|
+
|
|
1331
|
+
_log_events_to_app_insights(
|
|
1332
|
+
event_logger=event_logger,
|
|
1333
|
+
events=result["results"],
|
|
1334
|
+
log_attributes=log_attributes,
|
|
1335
|
+
data_source_item=result["datasource_item"] if "datasource_item" in result else None,
|
|
1336
|
+
evaluator_config=evaluator_config,
|
|
1337
|
+
app_insights_config=app_insights_config,
|
|
1338
|
+
)
|
|
1339
|
+
# Force flush to ensure events are sent
|
|
1340
|
+
logger_provider.force_flush()
|
|
1341
|
+
LOGGER.info(f"Successfully logged {len(results)} evaluation results to App Insights")
|
|
1342
|
+
|
|
1343
|
+
except Exception as e:
|
|
1344
|
+
LOGGER.error(f"Failed to emit evaluation results to App Insights: {e}")
|
|
1345
|
+
|
|
1346
|
+
|
|
1347
|
+
def _preprocess_data(
|
|
1348
|
+
data: Union[str, os.PathLike],
|
|
1349
|
+
evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
|
|
1350
|
+
evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
|
|
1351
|
+
target: Optional[Callable] = None,
|
|
1352
|
+
output_path: Optional[Union[str, os.PathLike]] = None,
|
|
1353
|
+
azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
|
|
1354
|
+
evaluation_name: Optional[str] = None,
|
|
1355
|
+
fail_on_evaluator_errors: bool = False,
|
|
1356
|
+
tags: Optional[Dict[str, str]] = None,
|
|
1357
|
+
**kwargs,
|
|
1358
|
+
) -> __ValidatedData:
|
|
700
1359
|
# Process evaluator config to replace ${target.} with ${data.}
|
|
701
1360
|
if evaluator_config is None:
|
|
702
1361
|
evaluator_config = {}
|
|
1362
|
+
|
|
1363
|
+
input_data_df = _validate_and_load_data(
|
|
1364
|
+
target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name, tags
|
|
1365
|
+
)
|
|
1366
|
+
if target is not None:
|
|
1367
|
+
_validate_columns_for_target(input_data_df, target)
|
|
1368
|
+
|
|
703
1369
|
# extract column mapping dicts into dictionary mapping evaluator name to column mapping
|
|
704
1370
|
column_mapping = _process_column_mappings(
|
|
705
1371
|
{
|
|
@@ -708,35 +1374,115 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
708
1374
|
}
|
|
709
1375
|
)
|
|
710
1376
|
|
|
711
|
-
if target is not None:
|
|
712
|
-
_validate_columns_for_target(input_data_df, target)
|
|
713
|
-
|
|
714
|
-
pf_client = PFClient(user_agent=USER_AGENT)
|
|
715
|
-
target_run: Optional[Run] = None
|
|
716
|
-
|
|
717
1377
|
# Create default configuration for evaluators that directly maps
|
|
718
1378
|
# input data names to keyword inputs of the same name in the evaluators.
|
|
719
1379
|
column_mapping = column_mapping or {}
|
|
720
1380
|
column_mapping.setdefault("default", {})
|
|
721
1381
|
|
|
722
|
-
#
|
|
1382
|
+
# Split normal evaluators and OAI graders
|
|
1383
|
+
evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
|
|
1384
|
+
|
|
1385
|
+
target_run: Optional[BatchClientRun] = None
|
|
723
1386
|
target_generated_columns: Set[str] = set()
|
|
1387
|
+
batch_run_client: BatchClient
|
|
1388
|
+
batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
|
|
1389
|
+
|
|
1390
|
+
def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
|
|
1391
|
+
"""Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
|
|
1392
|
+
_use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
|
|
1393
|
+
_use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
|
|
1394
|
+
|
|
1395
|
+
if _use_run_submitter_client is None and _use_pf_client is None:
|
|
1396
|
+
# If both are unset, return default
|
|
1397
|
+
return "run_submitter"
|
|
1398
|
+
|
|
1399
|
+
if _use_run_submitter_client and _use_pf_client:
|
|
1400
|
+
raise EvaluationException(
|
|
1401
|
+
message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
|
|
1402
|
+
target=ErrorTarget.EVALUATE,
|
|
1403
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
1404
|
+
blame=ErrorBlame.USER_ERROR,
|
|
1405
|
+
)
|
|
1406
|
+
|
|
1407
|
+
if _use_run_submitter_client == False and _use_pf_client == False:
|
|
1408
|
+
return "code_client"
|
|
1409
|
+
|
|
1410
|
+
if _use_run_submitter_client:
|
|
1411
|
+
return "run_submitter"
|
|
1412
|
+
if _use_pf_client:
|
|
1413
|
+
return "pf_client"
|
|
1414
|
+
|
|
1415
|
+
if _use_run_submitter_client is None and _use_pf_client == False:
|
|
1416
|
+
return "run_submitter"
|
|
1417
|
+
if _use_run_submitter_client == False and _use_pf_client is None:
|
|
1418
|
+
return "pf_client"
|
|
1419
|
+
|
|
1420
|
+
assert False, "This should be impossible"
|
|
1421
|
+
|
|
1422
|
+
client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
|
|
1423
|
+
|
|
1424
|
+
if client_type == "run_submitter":
|
|
1425
|
+
batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
|
|
1426
|
+
batch_run_data = input_data_df
|
|
1427
|
+
elif client_type == "pf_client":
|
|
1428
|
+
batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
|
|
1429
|
+
# Ensure the absolute path is Re to pf.run, as relative path doesn't work with
|
|
1430
|
+
# multiple evaluators. If the path is already absolute, abspath will return the original path.
|
|
1431
|
+
batch_run_data = os.path.abspath(data)
|
|
1432
|
+
elif client_type == "code_client":
|
|
1433
|
+
batch_run_client = CodeClient()
|
|
1434
|
+
batch_run_data = input_data_df
|
|
1435
|
+
|
|
1436
|
+
# If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
|
|
724
1437
|
if data is not None and target is not None:
|
|
725
1438
|
input_data_df, target_generated_columns, target_run = _apply_target_to_data(
|
|
726
|
-
target,
|
|
1439
|
+
target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
|
|
727
1440
|
)
|
|
728
1441
|
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
1442
|
+
# IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
|
|
1443
|
+
# This ensures that evaluators get all rows (including failed ones with NaN values)
|
|
1444
|
+
if isinstance(batch_run_client, ProxyClient):
|
|
1445
|
+
# Create a temporary JSONL file with the complete dataframe
|
|
1446
|
+
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
|
|
1447
|
+
try:
|
|
1448
|
+
for _, row in input_data_df.iterrows():
|
|
1449
|
+
row_dict = row.to_dict()
|
|
1450
|
+
temp_file.write(json.dumps(row_dict) + "\n")
|
|
1451
|
+
temp_file.close()
|
|
1452
|
+
batch_run_data = temp_file.name
|
|
1453
|
+
|
|
1454
|
+
# Update column mappings to use data references instead of run outputs
|
|
1455
|
+
for evaluator_name, mapping in column_mapping.items():
|
|
1456
|
+
mapped_to_values = set(mapping.values())
|
|
1457
|
+
for col in target_generated_columns:
|
|
1458
|
+
# Use data reference instead of run output to ensure we get all rows
|
|
1459
|
+
target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
|
|
1460
|
+
|
|
1461
|
+
# We will add our mapping only if customer did not map target output.
|
|
1462
|
+
if col not in mapping and target_reference not in mapped_to_values:
|
|
1463
|
+
column_mapping[evaluator_name][col] = target_reference
|
|
1464
|
+
|
|
1465
|
+
# Don't pass the target_run since we're now using the complete dataframe
|
|
1466
|
+
target_run = None
|
|
1467
|
+
|
|
1468
|
+
except Exception as e:
|
|
1469
|
+
# Clean up the temp file if something goes wrong
|
|
1470
|
+
if os.path.exists(temp_file.name):
|
|
1471
|
+
os.unlink(temp_file.name)
|
|
1472
|
+
raise e
|
|
1473
|
+
else:
|
|
1474
|
+
# For DataFrame-based clients, update batch_run_data to use the updated input_data_df
|
|
1475
|
+
batch_run_data = input_data_df
|
|
1476
|
+
|
|
1477
|
+
# Update column mappings for DataFrame clients
|
|
1478
|
+
for evaluator_name, mapping in column_mapping.items():
|
|
1479
|
+
mapped_to_values = set(mapping.values())
|
|
1480
|
+
for col in target_generated_columns:
|
|
1481
|
+
target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
|
|
1482
|
+
|
|
1483
|
+
# We will add our mapping only if customer did not map target output.
|
|
1484
|
+
if col not in mapping and target_reference not in mapped_to_values:
|
|
1485
|
+
column_mapping[evaluator_name][col] = target_reference
|
|
740
1486
|
|
|
741
1487
|
# After we have generated all columns, we can check if we have everything we need for evaluators.
|
|
742
1488
|
_validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
|
|
@@ -745,24 +1491,156 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
745
1491
|
# via target mapping.
|
|
746
1492
|
# If both the data and the output dictionary of the target function
|
|
747
1493
|
# have the same column, then the target function value is used.
|
|
1494
|
+
# NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
|
|
1495
|
+
# Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
|
|
1496
|
+
if input_data_df is not None:
|
|
1497
|
+
if "conversation" in input_data_df.columns or "messages" in input_data_df.columns:
|
|
1498
|
+
# No action is taken when 'conversation' or 'messages' columns are present,
|
|
1499
|
+
# as these indicate chat/conversation data which should not be flattened or mapped by default.
|
|
1500
|
+
pass
|
|
1501
|
+
else:
|
|
1502
|
+
input_data_df = _flatten_object_columns_for_default_mapping(input_data_df)
|
|
1503
|
+
|
|
1504
|
+
# Build default mapping for leaves:
|
|
748
1505
|
if input_data_df is not None:
|
|
1506
|
+
# First, map flattened nested columns (those containing a dot) to leaf names.
|
|
1507
|
+
for col in input_data_df.columns:
|
|
1508
|
+
# Skip target output columns
|
|
1509
|
+
if col.startswith(Prefixes.TSG_OUTPUTS):
|
|
1510
|
+
continue
|
|
1511
|
+
# Skip root container columns (no dot) here; they'll be handled below if truly primitive.
|
|
1512
|
+
if "." in col:
|
|
1513
|
+
leaf_name = col.split(".")[-1]
|
|
1514
|
+
if leaf_name not in column_mapping["default"]:
|
|
1515
|
+
column_mapping["default"][leaf_name] = f"${{data.{col}}}"
|
|
1516
|
+
|
|
1517
|
+
# Then, handle remaining top-level primitive columns (original logic).
|
|
749
1518
|
for col in input_data_df.columns:
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
1519
|
+
if (
|
|
1520
|
+
not col.startswith(Prefixes.TSG_OUTPUTS)
|
|
1521
|
+
and col not in column_mapping["default"].keys()
|
|
1522
|
+
and "." not in col # only pure top-level primitives
|
|
1523
|
+
):
|
|
753
1524
|
column_mapping["default"][col] = f"${{data.{col}}}"
|
|
754
1525
|
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
1526
|
+
return __ValidatedData(
|
|
1527
|
+
evaluators=evaluators,
|
|
1528
|
+
graders=graders,
|
|
1529
|
+
input_data_df=input_data_df,
|
|
1530
|
+
column_mapping=column_mapping,
|
|
1531
|
+
target_run=target_run,
|
|
1532
|
+
batch_run_client=batch_run_client,
|
|
1533
|
+
batch_run_data=batch_run_data,
|
|
1534
|
+
)
|
|
1535
|
+
|
|
1536
|
+
|
|
1537
|
+
def _flatten_object_columns_for_default_mapping(
|
|
1538
|
+
df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
|
|
1539
|
+
) -> pd.DataFrame:
|
|
1540
|
+
"""Flatten nested dictionary-valued columns into dotted leaf columns.
|
|
1541
|
+
|
|
1542
|
+
For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
|
|
1543
|
+
leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
|
|
1544
|
+
columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
|
|
1545
|
+
all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
|
|
1546
|
+
are all treated as leaves). Existing columns are never overwritten (idempotent behavior).
|
|
1547
|
+
|
|
1548
|
+
Example
|
|
1549
|
+
If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
|
|
1550
|
+
columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.
|
|
1551
|
+
|
|
1552
|
+
:param df: Input DataFrame to flatten in place.
|
|
1553
|
+
:type df: ~pandas.DataFrame
|
|
1554
|
+
:param root_prefixes: Optional iterable restricting which top-level columns are considered
|
|
1555
|
+
for flattening. If ``None``, all columns containing at least one ``dict`` value are processed.
|
|
1556
|
+
:type root_prefixes: Optional[Iterable[str]]
|
|
1557
|
+
:return: The same DataFrame instance (returned for convenient chaining).
|
|
1558
|
+
:rtype: ~pandas.DataFrame
|
|
1559
|
+
"""
|
|
1560
|
+
candidate_cols = []
|
|
1561
|
+
if root_prefixes is not None:
|
|
1562
|
+
candidate_cols = [c for c in root_prefixes if c in df.columns]
|
|
1563
|
+
else:
|
|
1564
|
+
# pick columns where at least one non-null value is a dict
|
|
1565
|
+
for c in df.columns:
|
|
1566
|
+
series = df[c]
|
|
1567
|
+
if series.map(lambda v: isinstance(v, dict)).any():
|
|
1568
|
+
candidate_cols.append(c)
|
|
1569
|
+
|
|
1570
|
+
def _extract_leaves(obj: Any, prefix: str) -> Iterator[Tuple[str, Any]]:
|
|
1571
|
+
if isinstance(obj, dict):
|
|
1572
|
+
for k, v in obj.items():
|
|
1573
|
+
new_prefix = f"{prefix}.{k}" if prefix else k
|
|
1574
|
+
if isinstance(v, dict):
|
|
1575
|
+
yield from _extract_leaves(v, new_prefix)
|
|
1576
|
+
else:
|
|
1577
|
+
# treat list / primitive / None as leaf
|
|
1578
|
+
yield new_prefix, v
|
|
1579
|
+
|
|
1580
|
+
for root_col in candidate_cols:
|
|
1581
|
+
# Build a union of leaf paths across rows to ensure consistent columns
|
|
1582
|
+
leaf_paths: Set[str] = set()
|
|
1583
|
+
for val in df[root_col]:
|
|
1584
|
+
if isinstance(val, dict):
|
|
1585
|
+
for path, _ in _extract_leaves(val, root_col):
|
|
1586
|
+
leaf_paths.add(path)
|
|
1587
|
+
|
|
1588
|
+
if not leaf_paths:
|
|
1589
|
+
continue
|
|
1590
|
+
|
|
1591
|
+
# Create each flattened column if absent
|
|
1592
|
+
for path in leaf_paths:
|
|
1593
|
+
if path in df.columns:
|
|
1594
|
+
continue # already present
|
|
1595
|
+
relative_keys = path[len(root_col) + 1 :].split(".") if len(path) > len(root_col) else []
|
|
1596
|
+
|
|
1597
|
+
def getter(root_val: Any) -> Any:
|
|
1598
|
+
cur = root_val
|
|
1599
|
+
for rk in relative_keys:
|
|
1600
|
+
if not isinstance(cur, dict):
|
|
1601
|
+
return None
|
|
1602
|
+
cur = cur.get(rk, None)
|
|
1603
|
+
return cur
|
|
1604
|
+
|
|
1605
|
+
df[path] = df[root_col].map(lambda rv: getter(rv) if isinstance(rv, dict) else None)
|
|
1606
|
+
|
|
1607
|
+
return df
|
|
1608
|
+
|
|
1609
|
+
|
|
1610
|
+
def _run_callable_evaluators(
|
|
1611
|
+
validated_data: __ValidatedData,
|
|
1612
|
+
fail_on_evaluator_errors: bool = False,
|
|
1613
|
+
**kwargs,
|
|
1614
|
+
) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
|
|
1615
|
+
|
|
1616
|
+
# Extract needed values
|
|
1617
|
+
batch_run_client = validated_data["batch_run_client"]
|
|
1618
|
+
target_run = validated_data["target_run"]
|
|
1619
|
+
batch_run_data = validated_data["batch_run_data"]
|
|
1620
|
+
column_mapping = validated_data["column_mapping"]
|
|
1621
|
+
evaluators = validated_data["evaluators"]
|
|
1622
|
+
|
|
1623
|
+
# Clean up temporary file after evaluation if it was created
|
|
1624
|
+
temp_file_to_cleanup = None
|
|
1625
|
+
if (
|
|
1626
|
+
isinstance(batch_run_client, ProxyClient)
|
|
1627
|
+
and isinstance(batch_run_data, str)
|
|
1628
|
+
and batch_run_data.endswith(".jsonl")
|
|
1629
|
+
):
|
|
1630
|
+
# Check if it's a temporary file (contains temp directory path)
|
|
1631
|
+
if tempfile.gettempdir() in batch_run_data:
|
|
1632
|
+
temp_file_to_cleanup = batch_run_data
|
|
1633
|
+
|
|
1634
|
+
try:
|
|
758
1635
|
with EvalRunContext(batch_run_client):
|
|
759
1636
|
runs = {
|
|
760
1637
|
evaluator_name: batch_run_client.run(
|
|
761
1638
|
flow=evaluator,
|
|
1639
|
+
data=batch_run_data,
|
|
1640
|
+
# Don't pass target_run when using complete dataframe
|
|
762
1641
|
run=target_run,
|
|
763
1642
|
evaluator_name=evaluator_name,
|
|
764
1643
|
column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
|
|
765
|
-
data=data,
|
|
766
1644
|
stream=True,
|
|
767
1645
|
name=kwargs.get("_run_name"),
|
|
768
1646
|
)
|
|
@@ -770,7 +1648,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
770
1648
|
}
|
|
771
1649
|
|
|
772
1650
|
# get_details needs to be called within EvalRunContext scope in order to have user agent populated
|
|
773
|
-
|
|
1651
|
+
per_evaluator_results: Dict[str, __EvaluatorInfo] = {
|
|
774
1652
|
evaluator_name: {
|
|
775
1653
|
"result": batch_run_client.get_details(run, all_results=True),
|
|
776
1654
|
"metrics": batch_run_client.get_metrics(run),
|
|
@@ -778,22 +1656,21 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
778
1656
|
}
|
|
779
1657
|
for evaluator_name, run in runs.items()
|
|
780
1658
|
}
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
|
|
789
|
-
else:
|
|
790
|
-
data = input_data_df
|
|
791
|
-
per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
|
|
792
|
-
|
|
1659
|
+
finally:
|
|
1660
|
+
# Clean up temporary file if it was created
|
|
1661
|
+
if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
|
|
1662
|
+
try:
|
|
1663
|
+
os.unlink(temp_file_to_cleanup)
|
|
1664
|
+
except Exception as e:
|
|
1665
|
+
LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
|
|
793
1666
|
# Concatenate all results
|
|
794
|
-
evaluators_result_df =
|
|
1667
|
+
evaluators_result_df = pd.DataFrame()
|
|
795
1668
|
evaluators_metric = {}
|
|
796
1669
|
for evaluator_name, evaluator_result in per_evaluator_results.items():
|
|
1670
|
+
if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
|
|
1671
|
+
_print_summary(per_evaluator_results)
|
|
1672
|
+
_turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
|
|
1673
|
+
|
|
797
1674
|
evaluator_result_df = evaluator_result["result"]
|
|
798
1675
|
|
|
799
1676
|
# drop input columns
|
|
@@ -822,31 +1699,821 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
|
|
|
822
1699
|
# Rename columns, generated by target function to outputs instead of inputs.
|
|
823
1700
|
# If target generates columns, already present in the input data, these columns
|
|
824
1701
|
# will be marked as outputs already so we do not need to rename them.
|
|
825
|
-
input_data_df = _rename_columns_conditionally(input_data_df)
|
|
826
|
-
|
|
827
|
-
result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
|
|
828
|
-
metrics = _aggregate_metrics(evaluators_result_df, evaluators)
|
|
829
|
-
metrics.update(evaluators_metric)
|
|
830
|
-
|
|
831
|
-
# Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
|
|
832
|
-
target_run = None
|
|
833
|
-
trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
|
|
834
|
-
studio_url = None
|
|
835
|
-
if trace_destination:
|
|
836
|
-
studio_url = _log_metrics_and_instance_results(
|
|
837
|
-
metrics,
|
|
838
|
-
result_df,
|
|
839
|
-
trace_destination,
|
|
840
|
-
target_run,
|
|
841
|
-
evaluation_name,
|
|
842
|
-
)
|
|
843
1702
|
|
|
844
|
-
|
|
845
|
-
|
|
1703
|
+
input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
|
|
1704
|
+
eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
|
|
1705
|
+
eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
|
|
1706
|
+
eval_metrics.update(evaluators_metric)
|
|
846
1707
|
|
|
847
|
-
|
|
1708
|
+
return eval_result_df, eval_metrics, per_evaluator_results
|
|
848
1709
|
|
|
849
|
-
if output_path:
|
|
850
|
-
_write_output(output_path, result)
|
|
851
1710
|
|
|
852
|
-
|
|
1711
|
+
def _map_names_to_builtins(
|
|
1712
|
+
evaluators: Dict[str, Callable],
|
|
1713
|
+
graders: Dict[str, AzureOpenAIGrader],
|
|
1714
|
+
) -> Dict[str, str]:
|
|
1715
|
+
"""
|
|
1716
|
+
Construct a mapping from user-supplied evaluator names to which known, built-in
|
|
1717
|
+
evaluator or grader they refer to. Custom evaluators are excluded from the mapping
|
|
1718
|
+
as we only want to track built-in evaluators and graders.
|
|
1719
|
+
|
|
1720
|
+
:param evaluators: The dictionary of evaluators.
|
|
1721
|
+
:type evaluators: Dict[str, Callable]
|
|
1722
|
+
:param graders: The dictionary of graders.
|
|
1723
|
+
:type graders: Dict[str, AzureOpenAIGrader]
|
|
1724
|
+
:param evaluator_config: The configuration for evaluators.
|
|
1725
|
+
:type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
|
|
1726
|
+
|
|
1727
|
+
"""
|
|
1728
|
+
from .._eval_mapping import EVAL_CLASS_MAP
|
|
1729
|
+
|
|
1730
|
+
name_map = {}
|
|
1731
|
+
|
|
1732
|
+
for name, evaluator in evaluators.items():
|
|
1733
|
+
# Check if the evaluator is a known built-in evaluator
|
|
1734
|
+
found_eval = False
|
|
1735
|
+
for eval_class, eval_id in EVAL_CLASS_MAP.items():
|
|
1736
|
+
if isinstance(evaluator, eval_class):
|
|
1737
|
+
name_map[name] = eval_id
|
|
1738
|
+
found_eval = True
|
|
1739
|
+
break
|
|
1740
|
+
if not found_eval:
|
|
1741
|
+
# Skip custom evaluators - we only want to track built-in evaluators
|
|
1742
|
+
pass
|
|
1743
|
+
|
|
1744
|
+
for name, grader in graders.items():
|
|
1745
|
+
name_map[name] = grader.id
|
|
1746
|
+
|
|
1747
|
+
return name_map
|
|
1748
|
+
|
|
1749
|
+
|
|
1750
|
+
def _turn_error_logs_into_exception(log_path: str) -> None:
|
|
1751
|
+
"""Produce an EvaluationException using the contents of the inputted
|
|
1752
|
+
file as the error message.
|
|
1753
|
+
|
|
1754
|
+
:param log_path: The path to the error log file.
|
|
1755
|
+
:type log_path: str
|
|
1756
|
+
"""
|
|
1757
|
+
with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
|
|
1758
|
+
error_message = file.read()
|
|
1759
|
+
raise EvaluationException(
|
|
1760
|
+
message=error_message,
|
|
1761
|
+
target=ErrorTarget.EVALUATE,
|
|
1762
|
+
category=ErrorCategory.FAILED_EXECUTION,
|
|
1763
|
+
blame=ErrorBlame.UNKNOWN,
|
|
1764
|
+
)
|
|
1765
|
+
|
|
1766
|
+
|
|
1767
|
+
def _convert_results_to_aoai_evaluation_results(
|
|
1768
|
+
results: EvaluationResult,
|
|
1769
|
+
logger: logging.Logger,
|
|
1770
|
+
eval_id: Optional[str] = None,
|
|
1771
|
+
eval_run_id: Optional[str] = None,
|
|
1772
|
+
evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] = None,
|
|
1773
|
+
eval_run_summary: Optional[Dict[str, Any]] = None,
|
|
1774
|
+
eval_meta_data: Optional[Dict[str, Any]] = None,
|
|
1775
|
+
) -> None:
|
|
1776
|
+
"""
|
|
1777
|
+
Convert evaluation results to AOAI evaluation results format.
|
|
1778
|
+
|
|
1779
|
+
Each row of input results.rows looks like:
|
|
1780
|
+
{"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe",
|
|
1781
|
+
"inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.",
|
|
1782
|
+
"outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5}
|
|
1783
|
+
|
|
1784
|
+
Convert each row into new RunOutputItem object with results array.
|
|
1785
|
+
|
|
1786
|
+
:param results: The evaluation results to convert
|
|
1787
|
+
:type results: EvaluationResult
|
|
1788
|
+
:param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria
|
|
1789
|
+
:type eval_meta_data: Dict[str, Any]
|
|
1790
|
+
:param logger: Logger instance
|
|
1791
|
+
:type logger: logging.Logger
|
|
1792
|
+
:return: EvaluationResult with converted evaluation results in AOAI format
|
|
1793
|
+
:rtype: EvaluationResult
|
|
1794
|
+
"""
|
|
1795
|
+
|
|
1796
|
+
if evaluators is None:
|
|
1797
|
+
return
|
|
1798
|
+
|
|
1799
|
+
# Get the testing_criteria_name and testing_criteria_type from evaluators
|
|
1800
|
+
testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {}
|
|
1801
|
+
criteria_name_types_from_meta: Optional[Dict[str, str]] = {}
|
|
1802
|
+
if eval_meta_data and "testing_criteria" in eval_meta_data:
|
|
1803
|
+
testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")
|
|
1804
|
+
if testing_criteria_list is not None:
|
|
1805
|
+
for criteria in testing_criteria_list:
|
|
1806
|
+
criteria_name = criteria.get("name")
|
|
1807
|
+
criteria_type = criteria.get("type")
|
|
1808
|
+
if criteria_name is not None and criteria_type is not None:
|
|
1809
|
+
criteria_name_types_from_meta[criteria_name] = criteria
|
|
1810
|
+
|
|
1811
|
+
for criteria_name, evaluator in evaluators.items():
|
|
1812
|
+
criteria_type = None
|
|
1813
|
+
metrics = []
|
|
1814
|
+
if criteria_name in criteria_name_types_from_meta:
|
|
1815
|
+
criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
|
|
1816
|
+
evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
|
|
1817
|
+
current_evaluator_metrics = criteria_name_types_from_meta[criteria_name].get("metrics", None)
|
|
1818
|
+
if current_evaluator_metrics and len(current_evaluator_metrics) > 0:
|
|
1819
|
+
metrics.extend(current_evaluator_metrics)
|
|
1820
|
+
elif evaluator_name:
|
|
1821
|
+
if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
|
|
1822
|
+
evaluator_name = evaluator_name.replace("builtin.", "")
|
|
1823
|
+
metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
|
|
1824
|
+
if metrics_mapped and len(metrics_mapped) > 0:
|
|
1825
|
+
metrics.extend(metrics_mapped)
|
|
1826
|
+
else:
|
|
1827
|
+
metrics.append(criteria_name)
|
|
1828
|
+
else:
|
|
1829
|
+
metrics.append(criteria_name)
|
|
1830
|
+
elif isinstance(evaluator, AzureOpenAIGrader):
|
|
1831
|
+
criteria_type = evaluator._type # pylint: disable=protected-access
|
|
1832
|
+
metrics.append(criteria_name)
|
|
1833
|
+
elif isinstance(evaluator, EvaluatorBase):
|
|
1834
|
+
criteria_type = "azure_ai_evaluator"
|
|
1835
|
+
evaluator_class_name = evaluator.__class__.__name__
|
|
1836
|
+
eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None)
|
|
1837
|
+
if eval_name:
|
|
1838
|
+
metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
|
|
1839
|
+
if metrics_mapped and len(metrics_mapped) > 0:
|
|
1840
|
+
metrics.extend(metrics_mapped)
|
|
1841
|
+
else:
|
|
1842
|
+
metrics.append(criteria_name)
|
|
1843
|
+
else:
|
|
1844
|
+
criteria_type = "unknown"
|
|
1845
|
+
metrics.append(criteria_name)
|
|
1846
|
+
testing_criteria_name_types_metrics[criteria_name] = {"type": criteria_type, "metrics": metrics}
|
|
1847
|
+
|
|
1848
|
+
created_time = int(time.time())
|
|
1849
|
+
converted_rows = []
|
|
1850
|
+
|
|
1851
|
+
for row_idx, row in enumerate(results.get("rows", [])):
|
|
1852
|
+
# Group outputs by test criteria name
|
|
1853
|
+
criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()}
|
|
1854
|
+
input_groups = {}
|
|
1855
|
+
top_sample = {}
|
|
1856
|
+
for key, value in row.items():
|
|
1857
|
+
if key.startswith("outputs."):
|
|
1858
|
+
# Parse key: outputs.<test-criteria-name>.<metric>
|
|
1859
|
+
parts = key.split(".", 2) # Split into max 3 parts: ['outputs', '<criteria-name>', '<metric>']
|
|
1860
|
+
if len(parts) >= 3:
|
|
1861
|
+
criteria_name = parts[1]
|
|
1862
|
+
metric_name = parts[2]
|
|
1863
|
+
|
|
1864
|
+
if criteria_name not in criteria_groups:
|
|
1865
|
+
criteria_groups[criteria_name] = {}
|
|
1866
|
+
|
|
1867
|
+
criteria_groups[criteria_name][metric_name] = value
|
|
1868
|
+
elif key.startswith("inputs."):
|
|
1869
|
+
input_key = key.replace("inputs.", "")
|
|
1870
|
+
if input_key not in input_groups:
|
|
1871
|
+
input_groups[input_key] = value
|
|
1872
|
+
|
|
1873
|
+
# Convert each criteria group to RunOutputItem result
|
|
1874
|
+
run_output_results = []
|
|
1875
|
+
for criteria_name, metrics in criteria_groups.items():
|
|
1876
|
+
# Extract metrics for this criteria
|
|
1877
|
+
expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
|
|
1878
|
+
criteria_type = testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "unknown")
|
|
1879
|
+
result_per_metric = {}
|
|
1880
|
+
# Find score - look for various score patterns
|
|
1881
|
+
for metric_key, metric_value in metrics.items():
|
|
1882
|
+
if metric_key.endswith("_score") or metric_key == "score":
|
|
1883
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1884
|
+
if metric not in result_per_metric:
|
|
1885
|
+
result_per_metric[metric] = {"score": metric_value}
|
|
1886
|
+
else:
|
|
1887
|
+
result_per_metric[metric]["score"] = metric_value
|
|
1888
|
+
_append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
|
|
1889
|
+
if metric_key == "passed":
|
|
1890
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1891
|
+
if metric not in result_per_metric:
|
|
1892
|
+
result_per_metric[metric] = {"passed": metric_value}
|
|
1893
|
+
else:
|
|
1894
|
+
result_per_metric[metric]["passed"] = metric_value
|
|
1895
|
+
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, metric_value)
|
|
1896
|
+
elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
|
|
1897
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1898
|
+
label = metric_value
|
|
1899
|
+
passed = (
|
|
1900
|
+
True if (str(metric_value).lower() == "pass" or str(metric_value).lower() == "true") else False
|
|
1901
|
+
)
|
|
1902
|
+
if metric not in result_per_metric:
|
|
1903
|
+
if criteria_type == "azure_ai_evaluator":
|
|
1904
|
+
result_per_metric[metric] = {"label": label, "passed": passed}
|
|
1905
|
+
else:
|
|
1906
|
+
result_per_metric[metric] = {"label": label}
|
|
1907
|
+
else:
|
|
1908
|
+
result_per_metric[metric]["label"] = metric_value
|
|
1909
|
+
if criteria_type == "azure_ai_evaluator":
|
|
1910
|
+
result_per_metric[metric]["passed"] = passed
|
|
1911
|
+
_append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
|
|
1912
|
+
if criteria_type == "azure_ai_evaluator":
|
|
1913
|
+
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
|
|
1914
|
+
elif (
|
|
1915
|
+
metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
|
|
1916
|
+
) or metric_key == "reason":
|
|
1917
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1918
|
+
if metric not in result_per_metric:
|
|
1919
|
+
result_per_metric[metric] = {"reason": metric_value}
|
|
1920
|
+
else:
|
|
1921
|
+
result_per_metric[metric]["reason"] = metric_value
|
|
1922
|
+
_append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
|
|
1923
|
+
elif metric_key.endswith("_threshold") or metric_key == "threshold":
|
|
1924
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1925
|
+
if metric not in result_per_metric:
|
|
1926
|
+
result_per_metric[metric] = {"threshold": metric_value}
|
|
1927
|
+
else:
|
|
1928
|
+
result_per_metric[metric]["threshold"] = metric_value
|
|
1929
|
+
_append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
|
|
1930
|
+
elif metric_key == "sample":
|
|
1931
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1932
|
+
if metric not in result_per_metric:
|
|
1933
|
+
result_per_metric[metric] = {"sample": metric_value}
|
|
1934
|
+
else:
|
|
1935
|
+
result_per_metric[metric]["sample"] = metric_value
|
|
1936
|
+
_append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
|
|
1937
|
+
elif metric_key.endswith("_finish_reason"):
|
|
1938
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1939
|
+
if metric not in result_per_metric:
|
|
1940
|
+
result_per_metric[metric] = {"sample": {"finish_reason": metric_value}}
|
|
1941
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
1942
|
+
result_per_metric[metric]["sample"] = {"finish_reason": metric_value}
|
|
1943
|
+
elif (
|
|
1944
|
+
metric in result_per_metric
|
|
1945
|
+
and "sample" in result_per_metric[metric]
|
|
1946
|
+
and "finish_reason" not in result_per_metric[metric]["sample"]
|
|
1947
|
+
):
|
|
1948
|
+
result_per_metric[metric]["sample"]["finish_reason"] = metric_value
|
|
1949
|
+
_append_indirect_attachments_to_results(
|
|
1950
|
+
result_per_metric, "sample", metric, metric_value, "finish_reason"
|
|
1951
|
+
)
|
|
1952
|
+
elif metric_key.endswith("_model"):
|
|
1953
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1954
|
+
if metric not in result_per_metric:
|
|
1955
|
+
result_per_metric[metric] = {"sample": {"model": metric_value}}
|
|
1956
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
1957
|
+
result_per_metric[metric]["sample"] = {"model": metric_value}
|
|
1958
|
+
elif (
|
|
1959
|
+
metric in result_per_metric
|
|
1960
|
+
and "sample" in result_per_metric[metric]
|
|
1961
|
+
and "model" not in result_per_metric[metric]["sample"]
|
|
1962
|
+
):
|
|
1963
|
+
result_per_metric[metric]["sample"]["model"] = metric_value
|
|
1964
|
+
_append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
|
|
1965
|
+
elif metric_key.endswith("_sample_input"):
|
|
1966
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1967
|
+
input_metric_val_json: Optional[List[Dict[str, Any]]] = []
|
|
1968
|
+
try:
|
|
1969
|
+
input_metric_val_json = json.loads(metric_value)
|
|
1970
|
+
except Exception as e:
|
|
1971
|
+
logger.warning(f"Failed to parse _sample_input value as JSON: {e}")
|
|
1972
|
+
if metric not in result_per_metric:
|
|
1973
|
+
result_per_metric[metric] = {"sample": {"input": input_metric_val_json}}
|
|
1974
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
1975
|
+
result_per_metric[metric]["sample"] = {"input": input_metric_val_json}
|
|
1976
|
+
elif (
|
|
1977
|
+
metric in result_per_metric
|
|
1978
|
+
and "sample" in result_per_metric[metric]
|
|
1979
|
+
and "input" not in result_per_metric[metric]["sample"]
|
|
1980
|
+
):
|
|
1981
|
+
result_per_metric[metric]["sample"]["input"] = input_metric_val_json
|
|
1982
|
+
_append_indirect_attachments_to_results(
|
|
1983
|
+
result_per_metric, "sample", metric, input_metric_val_json, "input"
|
|
1984
|
+
)
|
|
1985
|
+
elif metric_key.endswith("_sample_output"):
|
|
1986
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1987
|
+
output_metric_val_json: Optional[List[Dict[str, Any]]] = []
|
|
1988
|
+
try:
|
|
1989
|
+
output_metric_val_json = json.loads(metric_value)
|
|
1990
|
+
except Exception as e:
|
|
1991
|
+
logger.warning(f"Failed to parse _sample_output value as JSON: {e}")
|
|
1992
|
+
if metric not in result_per_metric:
|
|
1993
|
+
result_per_metric[metric] = {"sample": {"output": output_metric_val_json}}
|
|
1994
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
1995
|
+
result_per_metric[metric]["sample"] = {"output": output_metric_val_json}
|
|
1996
|
+
elif (
|
|
1997
|
+
metric in result_per_metric
|
|
1998
|
+
and "sample" in result_per_metric[metric]
|
|
1999
|
+
and "output" not in result_per_metric[metric]["sample"]
|
|
2000
|
+
):
|
|
2001
|
+
result_per_metric[metric]["sample"]["output"] = output_metric_val_json
|
|
2002
|
+
_append_indirect_attachments_to_results(
|
|
2003
|
+
result_per_metric, "sample", metric, output_metric_val_json, "output"
|
|
2004
|
+
)
|
|
2005
|
+
elif metric_key.endswith("_total_tokens"):
|
|
2006
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
2007
|
+
metric_value = None if _is_none_or_nan(metric_value) else metric_value
|
|
2008
|
+
if metric not in result_per_metric:
|
|
2009
|
+
result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
|
|
2010
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
2011
|
+
result_per_metric[metric]["sample"] = {"usage": {"total_tokens": metric_value}}
|
|
2012
|
+
elif (
|
|
2013
|
+
metric in result_per_metric
|
|
2014
|
+
and "sample" in result_per_metric[metric]
|
|
2015
|
+
and "usage" not in result_per_metric[metric]["sample"]
|
|
2016
|
+
):
|
|
2017
|
+
result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
|
|
2018
|
+
else:
|
|
2019
|
+
result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
|
|
2020
|
+
_append_indirect_attachments_to_results(
|
|
2021
|
+
result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
|
|
2022
|
+
)
|
|
2023
|
+
elif metric_key.endswith("_prompt_tokens"):
|
|
2024
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
2025
|
+
metric_value = None if _is_none_or_nan(metric_value) else metric_value
|
|
2026
|
+
if metric not in result_per_metric:
|
|
2027
|
+
result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
|
|
2028
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
2029
|
+
result_per_metric[metric]["sample"] = {"usage": {"prompt_tokens": metric_value}}
|
|
2030
|
+
elif (
|
|
2031
|
+
metric in result_per_metric
|
|
2032
|
+
and "sample" in result_per_metric[metric]
|
|
2033
|
+
and "usage" not in result_per_metric[metric]["sample"]
|
|
2034
|
+
):
|
|
2035
|
+
result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
|
|
2036
|
+
else:
|
|
2037
|
+
result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
|
|
2038
|
+
_append_indirect_attachments_to_results(
|
|
2039
|
+
result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
|
|
2040
|
+
)
|
|
2041
|
+
elif metric_key.endswith("_completion_tokens"):
|
|
2042
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
2043
|
+
metric_value = None if _is_none_or_nan(metric_value) else metric_value
|
|
2044
|
+
if metric not in result_per_metric:
|
|
2045
|
+
result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
|
|
2046
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
2047
|
+
result_per_metric[metric]["sample"] = {"usage": {"completion_tokens": metric_value}}
|
|
2048
|
+
elif (
|
|
2049
|
+
metric in result_per_metric
|
|
2050
|
+
and "sample" in result_per_metric[metric]
|
|
2051
|
+
and "usage" not in result_per_metric[metric]["sample"]
|
|
2052
|
+                    ):
+                        result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
+                    )
+                elif not any(
+                    metric_key.endswith(suffix)
+                    for suffix in [
+                        "_result",
+                        "_reason",
+                        "_threshold",
+                        "_label",
+                        "_score",
+                        "_model",
+                        "_finish_reason",
+                        "_sample_input",
+                        "_sample_output",
+                        "_total_tokens",
+                        "_prompt_tokens",
+                        "_completion_tokens",
+                    ]
+                ):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    # If no score found yet and this doesn't match other patterns, use as score
+                    if metric_key == metric and metric not in result_per_metric:
+                        result_per_metric[metric] = {"score": metric_value}
+                    elif metric_key == metric and result_per_metric[metric].get("score", None) is None:
+                        result_per_metric[metric]["score"] = metric_value
+
+            for metric, metric_values in result_per_metric.items():
+                score = metric_values.get("score", None)
+                label = metric_values.get("label", None)
+                reason = metric_values.get("reason", None)
+                threshold = metric_values.get("threshold", None)
+                passed = metric_values.get("passed", None)
+                sample = metric_values.get("sample", None)
+
+                # Create result object for this criteria
+                result_obj = {
+                    "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                        "type", "azure_ai_evaluator"
+                    ),
+                    "name": criteria_name,  # Use criteria name as name
+                    "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                }
+                # Add optional fields
+                if (
+                    metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
+                    or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
+                    or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
+                ):
+                    copy_label = label
+                    if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
+                        label = "fail"
+                        score = 0.0
+                        passed = False
+                    else:
+                        label = "pass"
+                        score = 1.0
+                        passed = True
+                result_obj["score"] = (
+                    score if not (score is None or (isinstance(score, float) and math.isnan(score))) else None
+                )
+                result_obj["label"] = label
+                result_obj["reason"] = reason
+                result_obj["threshold"] = threshold
+                result_obj["passed"] = passed
+
+                if sample is not None:
+                    result_obj["sample"] = sample
+                    top_sample = sample  # Save top sample for the row
+                run_output_results.append(result_obj)
+
+            if (
+                eval_run_summary
+                and criteria_name in eval_run_summary
+                and isinstance(eval_run_summary[criteria_name], dict)
+                and "error_code" in eval_run_summary[criteria_name]
+            ) and eval_run_summary[criteria_name].get("error_code", None) is not None:
+                error_info = (
+                    {
+                        "code": eval_run_summary[criteria_name].get("error_code", None),
+                        "message": eval_run_summary[criteria_name].get("error_message", None),
+                    }
+                    if eval_run_summary[criteria_name].get("error_code", None) is not None
+                    else None
+                )
+                sample = {"error": error_info} if error_info is not None else None
+                # Create result object for this criteria
+                metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
+                for metric in metrics:
+                    should_add_error_summary = True
+                    for result in run_output_results:
+                        if result.get("name", None) == criteria_name and result.get("metric", None) == metric:
+                            rs_score = result.get("score", None)
+                            rs_threshold = result.get("threshold", None)
+                            rs_label = result.get("label", None)
+                            rs_reason = result.get("reason", None)
+                            if (
+                                _is_none_or_nan(rs_score)
+                                and _is_none_or_nan(rs_threshold)
+                                and _is_none_or_nan(rs_label)
+                                and _is_none_or_nan(rs_reason)
+                            ):
+                                run_output_results.remove(result)
+                            else:
+                                should_add_error_summary = False
+                            break  # Skip if already have result for this criteria and metric
+                    if should_add_error_summary:
+                        result_obj = {
+                            "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                                "type", "azure_ai_evaluator"
+                            ),
+                            "name": criteria_name,  # Use criteria name as name
+                            "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                            "score": None,
+                            "label": None,
+                            "reason": None,
+                            "threshold": None,
+                            "passed": None,
+                            "sample": sample,
+                        }
+                        run_output_results.append(result_obj)
+
+        # Create RunOutputItem structure
+        run_output_item = {
+            "object": "eval.run.output_item",
+            "id": f"{row_idx+1}",
+            "run_id": eval_run_id,
+            "eval_id": eval_id,
+            "created_at": created_time,
+            "datasource_item_id": row_idx,
+            "datasource_item": input_groups,
+            "results": run_output_results,
+            "status": "completed" if len(run_output_results) > 0 else "error",
+        }
+
+        run_output_item["sample"] = top_sample
+
+        converted_rows.append(run_output_item)
+
+    # Create converted results maintaining the same structure
+    results["_evaluation_results_list"] = converted_rows
+    logger.info(
+        f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
+    )
+    # Calculate summary statistics
+    evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger, criteria_name_types_from_meta)
+    results["_evaluation_summary"] = evaluation_summary
+    logger.info(
+        f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
+    )
+
+
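Note: for reference, here is a minimal sketch of the row shape the added conversion code above produces. Every value below is an illustrative placeholder (ids, timestamps, scores), not real evaluator output.

```python
# Illustrative shape of one converted row ("eval.run.output_item"); values are placeholders.
example_run_output_item = {
    "object": "eval.run.output_item",
    "id": "1",                          # f"{row_idx+1}" -> a string index
    "run_id": "example-run-id",
    "eval_id": "example-eval-id",
    "created_at": 1700000000,
    "datasource_item_id": 0,
    "datasource_item": {"query": "...", "response": "..."},
    "results": [
        {
            "type": "azure_ai_evaluator",
            "name": "example_criteria",
            "metric": "example_metric",
            "score": 1.0,
            "label": "pass",
            "reason": "...",
            "threshold": 3,
            "passed": True,
            "sample": {"usage": {"completion_tokens": 42}},
        }
    ],
    "status": "completed",              # "error" when no per-criteria results were produced
    "sample": {"usage": {"completion_tokens": 42}},  # top_sample: last result-level sample seen
}
```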
+def _is_none_or_nan(value: Any) -> bool:
+    """
+    Check if a value is None or NaN.
+
+    :param value: The value to check
+    :type value: Any
+    :return: True if the value is None or NaN, False otherwise
+    :rtype: bool
+    """
+    if value is None:
+        return True
+    if isinstance(value, float) and math.isnan(value):
+        return True
+    if isinstance(value, str) and value.lower() in ["nan", "null", "none", ""]:
+        return True
+    return False
+
+
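Note: a quick, self-contained sketch of what this helper treats as "empty". It simply mirrors the three checks in the added code above against a handful of sample values; it does not import anything from the package.

```python
import math

# Mirror of the three checks in _is_none_or_nan, applied to sample values.
samples = [None, float("nan"), "NaN", "null", "", 0, 0.0, "0", "ok"]
for value in samples:
    is_empty = (
        value is None
        or (isinstance(value, float) and math.isnan(value))
        or (isinstance(value, str) and value.lower() in ["nan", "null", "none", ""])
    )
    print(repr(value), "->", is_empty)  # True for the first five, False for 0, 0.0, "0", "ok"
```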
+def _append_indirect_attachments_to_results(
+    current_result_dict: Dict[str, Any],
+    result_name: str,
+    metric: str,
+    metric_value: Any,
+    nested_result_name: Optional[str] = None,
+    secondnested_result_name: Optional[str] = None,
+) -> None:
+    """
+    Append indirect attachments to the current result dictionary.
+
+    :param current_result_dict: The current result dictionary to update
+    :type current_result_dict: Dict[str, Any]
+    :param result_name: The result name
+    :type result_name: str
+    :param metric: The metric name
+    :type metric: str
+    :param metric_value: The value of the metric
+    :type metric_value: Any
+    """
+    if metric == "xpia" and result_name:
+        for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
+            if nested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: metric_value}
+                else:
+                    current_result_dict[metric_extended][result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is not None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {
+                        result_name: {nested_result_name: {secondnested_result_name: metric_value}}
+                    }
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {
+                        nested_result_name: {secondnested_result_name: metric_value}
+                    }
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = {
+                        secondnested_result_name: metric_value
+                    }
+                else:
+                    (
+                        current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
+                    ) = metric_value
+
+
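Note: to illustrate the fan-out above, a call shaped like `_append_indirect_attachments_to_results(result_per_metric, "sample", "xpia", 42, "usage", "completion_tokens")` on a dict that does not yet contain the extended keys leaves it roughly as sketched below (42 is a placeholder token count). When some of those keys already exist, the elif branches merge the value in at the deepest missing level rather than replacing the whole entry.

```python
# State of result_per_metric after the call sketched above (placeholder value 42).
result_per_metric = {
    "xpia_manipulated_content": {"sample": {"usage": {"completion_tokens": 42}}},
    "xpia_intrusion": {"sample": {"usage": {"completion_tokens": 42}}},
    "xpia_information_gathering": {"sample": {"usage": {"completion_tokens": 42}}},
}
```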
+def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+    """
+    Get the metric name from the testing criteria and metric key.
+
+    :param testing_criteria_name: The name of the testing criteria
+    :type testing_criteria_name: str
+    :param metric_key: The metric key to look for
+    :type metric_key: str
+    :param metric_list: List of expected metrics for the testing criteria
+    :type metric_list: List[str]
+    :return: The metric name if found, otherwise the testing criteria name
+    :rtype: str
+    """
+    metric = None
+
+    if metric_key == "xpia_manipulated_content":
+        metric = "xpia_manipulated_content"
+        return metric
+    elif metric_key == "xpia_intrusion":
+        metric = "xpia_intrusion"
+        return metric
+    elif metric_key == "xpia_information_gathering":
+        metric = "xpia_information_gathering"
+        return metric
+    for expected_metric in metric_list:
+        if metric_key.startswith(expected_metric):
+            metric = expected_metric
+            break
+    if metric is None:
+        metric = testing_criteria_name
+    return metric
+
+
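Note: reading the branches above, the helper resolves a raw metric key roughly as in the sketch below. The criteria name, metric keys, and expected-metric list are illustrative inputs, not values from the package.

```python
# Illustrative resolutions, following _get_metric_from_criteria's branches above.
# Key: (testing_criteria_name, metric_key, metric_list) -> returned metric
expected_metric = {
    ("my_criteria", "xpia_intrusion", ("violence",)): "xpia_intrusion",  # hard-coded xpia keys win first
    ("my_criteria", "violence_score", ("violence",)): "violence",        # otherwise first startswith() match
    ("my_criteria", "unrelated_key", ("violence",)): "my_criteria",      # fallback: the criteria name itself
}
```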
+def _is_primary_metric(metric_name: str, evaluator_name: str) -> bool:
+    """
+    Check if the given metric name is a primary metric.
+
+    :param metric_name: The name of the metric
+    :type metric_name: str
+    :param evaluator_name: The name of the evaluator
+    :type evaluator_name: str
+    :return: True if the metric is a primary metric, False otherwise
+    :rtype: bool
+    """
+    if (
+        not _is_none_or_nan(metric_name)
+        and not _is_none_or_nan(evaluator_name)
+        and evaluator_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS
+        and isinstance(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name], list)
+        and len(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]) > 1
+        and metric_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]
+        and metric_name.lower() != _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name][0].lower()
+    ):
+        return False
+    else:
+        return True
+
+
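Note: the outcome depends on `_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS`, which is defined elsewhere in the package and not shown in this excerpt. Assuming, purely for illustration, that its "indirect_attack" entry lists "xpia" first followed by the three xpia_* sub-metrics, the helper above would classify metrics as sketched here.

```python
# Illustrative outcomes under the assumed "indirect_attack" mapping described above.
# Key: (metric_name, evaluator_name) -> _is_primary_metric(...)
expected_primary = {
    ("xpia", "indirect_attack"): True,             # first (primary) entry of the evaluator's metric list
    ("xpia_intrusion", "indirect_attack"): False,  # listed, but not first -> treated as a secondary metric
    ("relevance", "custom_evaluator"): True,       # evaluator not present in the mapping -> defaults to True
}
```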
+def _calculate_aoai_evaluation_summary(
+    aoai_results: list, logger: logging.Logger, criteria_name_types_from_meta: Optional[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """
+    Calculate summary statistics for AOAI evaluation results.
+
+    :param aoai_results: List of AOAI result objects (run_output_items)
+    :type aoai_results: list
+    :return: Summary statistics dictionary
+    :rtype: Dict[str, Any]
+    """
+    # Calculate result counts based on aoaiResults
+    result_counts = {"total": 0, "errored": 0, "failed": 0, "passed": 0}
+
+    # Count results by status and calculate per model usage
+    model_usage_stats = {}  # Dictionary to aggregate usage by model
+    result_counts_stats = {}  # Dictionary to aggregate pass/fail counts by testing criteria
+
+    for aoai_result in aoai_results:
+        logger.info(
+            f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}"
+        )
+        result_counts["total"] += 1
+        passed_count = 0
+        failed_count = 0
+        error_count = 0
+        if isinstance(aoai_result, dict) and "results" in aoai_result:
+            logger.info(
+                f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}"
+            )
+            for result_item in aoai_result["results"]:
+                if isinstance(result_item, dict):
+                    testing_criteria = result_item.get("name", "")
+                    is_primary_metric = True
+                    if (
+                        criteria_name_types_from_meta is not None
+                        and isinstance(criteria_name_types_from_meta, dict)
+                        and testing_criteria in criteria_name_types_from_meta
+                    ):
+                        evaluator_name = criteria_name_types_from_meta[testing_criteria].get("evaluator_name", None)
+                        criteria_type = criteria_name_types_from_meta[testing_criteria].get("type", None)
+                        if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+                            evaluator_name = evaluator_name.replace("builtin.", "")
+                        is_primary_metric = _is_primary_metric(result_item.get("metric", ""), evaluator_name)
+                    if not is_primary_metric:
+                        logger.info(
+                            f"Skip counts for non-primary metric for testing_criteria: {testing_criteria}, metric: {result_item.get('metric', '')}"
+                        )
+                        continue
+                    # Check if the result has a 'passed' field
+                    if "passed" in result_item and result_item["passed"] is not None:
+                        if testing_criteria not in result_counts_stats:
+                            result_counts_stats[testing_criteria] = {
+                                "testing_criteria": testing_criteria,
+                                "failed": 0,
+                                "passed": 0,
+                            }
+                        if result_item["passed"] is True:
+                            passed_count += 1
+                            result_counts_stats[testing_criteria]["passed"] += 1
+
+                        elif result_item["passed"] is False:
+                            failed_count += 1
+                            result_counts_stats[testing_criteria]["failed"] += 1
+                    # Check if the result indicates an error status
+                    elif ("status" in result_item and result_item["status"] in ["error", "errored"]) or (
+                        "sample" in result_item
+                        and isinstance(result_item["sample"], dict)
+                        and result_item["sample"].get("error", None) is not None
+                    ):
+                        error_count += 1
+        elif hasattr(aoai_result, "status") and aoai_result.status == "error":
+            error_count += 1
+        elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
+            error_count += 1
+
+        # Update overall result counts, error counts will not be considered for passed/failed
+        if error_count > 0:
+            result_counts["errored"] += 1
+
+        if failed_count > 0:
+            result_counts["failed"] += 1
+        elif (
+            failed_count == 0 and passed_count > 0 and passed_count == len(aoai_result.get("results", [])) - error_count
+        ):
+            result_counts["passed"] += 1
+
+        # Extract usage statistics from aoai_result.sample
+        sample_data_list = []
+        dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
+        dup_usage_list.remove("xpia")
+        if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
+            for result_item in aoai_result["results"]:
+                if (
+                    isinstance(result_item, dict)
+                    and "sample" in result_item
+                    and result_item["sample"]
+                    and result_item["metric"] not in dup_usage_list
+                ):
+                    sample_data_list.append(result_item["sample"])
+
+        for sample_data in sample_data_list:
+            if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
+                usage_data = sample_data["usage"]
+                model_name = sample_data.get("model", "unknown") if usage_data.get("model", "unknown") else "unknown"
+                if _is_none_or_nan(model_name):
+                    continue
+                if model_name not in model_usage_stats:
+                    model_usage_stats[model_name] = {
+                        "invocation_count": 0,
+                        "total_tokens": 0,
+                        "prompt_tokens": 0,
+                        "completion_tokens": 0,
+                        "cached_tokens": 0,
+                    }
+                # Aggregate usage statistics
+                model_stats = model_usage_stats[model_name]
+                model_stats["invocation_count"] += 1
+                if isinstance(usage_data, dict):
+                    cur_total_tokens = usage_data.get("total_tokens", 0)
+                    if _is_none_or_nan(cur_total_tokens):
+                        cur_total_tokens = 0
+                    cur_prompt_tokens = usage_data.get("prompt_tokens", 0)
+                    if _is_none_or_nan(cur_prompt_tokens):
+                        cur_prompt_tokens = 0
+                    cur_completion_tokens = usage_data.get("completion_tokens", 0)
+                    if _is_none_or_nan(cur_completion_tokens):
+                        cur_completion_tokens = 0
+                    cur_cached_tokens = usage_data.get("cached_tokens", 0)
+                    if _is_none_or_nan(cur_cached_tokens):
+                        cur_cached_tokens = 0
+                    logger.info(
+                        f"Model: {model_name}, cur_total_tokens: {cur_total_tokens}, {_is_none_or_nan(cur_total_tokens)}, cur_prompt_tokens: {cur_prompt_tokens}, cur_completion_tokens: {cur_completion_tokens}, cur_cached_tokens: {cur_cached_tokens}"
+                    )
+                    model_stats["total_tokens"] += cur_total_tokens
+                    model_stats["prompt_tokens"] += cur_prompt_tokens
+                    model_stats["completion_tokens"] += cur_completion_tokens
+                    model_stats["cached_tokens"] += cur_cached_tokens
+
+    # Convert model usage stats to list format matching EvaluationRunPerModelUsage
+    per_model_usage = []
+    for model_name, stats in model_usage_stats.items():
+        per_model_usage.append(
+            {
+                "model_name": model_name,
+                "invocation_count": stats["invocation_count"],
+                "total_tokens": stats["total_tokens"],
+                "prompt_tokens": stats["prompt_tokens"],
+                "completion_tokens": stats["completion_tokens"],
+                "cached_tokens": stats["cached_tokens"],
+            }
+        )
+    result_counts_stats_val = []
+    logger.info(f"\r\n Result counts stats: {result_counts_stats}")
+    for criteria_name, stats_val in result_counts_stats.items():
+        if isinstance(stats_val, dict):
+            logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}")
+            cur_passed = stats_val.get("passed", 0)
+            if _is_none_or_nan(cur_passed):
+                cur_passed = 0
+            cur_failed_count = stats_val.get("failed", 0)
+            if _is_none_or_nan(cur_failed_count):
+                cur_failed_count = 0
+            result_counts_stats_val.append(
+                {
+                    "testing_criteria": criteria_name if not _is_none_or_nan(criteria_name) else "unknown",
+                    "passed": cur_passed,
+                    "failed": cur_failed_count,
+                }
+            )
+    return {
+        "result_counts": result_counts,
+        "per_model_usage": per_model_usage,
+        "per_testing_criteria_results": result_counts_stats_val,
+    }
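Note: putting the pieces together, the summary returned by `_calculate_aoai_evaluation_summary` has roughly the shape sketched below. All counts, token totals, and names are placeholders chosen for illustration.

```python
# Illustrative shape of the returned summary; numbers and names are placeholders.
example_summary = {
    "result_counts": {"total": 10, "errored": 1, "failed": 2, "passed": 7},
    "per_model_usage": [
        {
            "model_name": "gpt-4o",          # placeholder model name
            "invocation_count": 10,
            "total_tokens": 12345,
            "prompt_tokens": 9000,
            "completion_tokens": 3345,
            "cached_tokens": 0,
        }
    ],
    "per_testing_criteria_results": [
        {"testing_criteria": "my_criteria", "passed": 7, "failed": 2},  # placeholder criteria name
    ],
}
```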