azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -5,11 +5,13 @@ import inspect
 import contextlib
 import json
 import logging
+import math
 import os
 import re
 import tempfile
 import json
-
+import time
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast

 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -18,8 +20,8 @@ import pandas as pd

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
+from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
 from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader

 from .._constants import (
@@ -31,8 +33,10 @@ from .._constants import (
     _InternalEvaluationMetrics,
     BINARY_AGGREGATE_SUFFIX,
     DEFAULT_OAI_EVAL_RUN_NAME,
+    EVALUATION_EVENT_NAME,
+    _EvaluatorMetricMapping,
 )
-from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig
 from .._user_agent import UserAgentSingleton
 from ._batch_run import (
     EvalRunContext,
@@ -282,6 +286,51 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
     return results


+def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
+    """Identify token count columns from known SDK metrics that should be excluded from aggregation.
+
+    Token counts from custom evaluators are not excluded, only those from EvaluationMetrics
+    and _InternalEvaluationMetrics.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: List of column names to exclude from aggregation.
+    :rtype: List[str]
+    """
+    # Get all metric values from EvaluationMetrics class
+    evaluation_metrics_values = [
+        getattr(EvaluationMetrics, attr)
+        for attr in dir(EvaluationMetrics)
+        if not attr.startswith("_") and isinstance(getattr(EvaluationMetrics, attr), str)
+    ]
+
+    # Get all metric values from _InternalEvaluationMetrics class
+    internal_metrics_values = [
+        getattr(_InternalEvaluationMetrics, attr)
+        for attr in dir(_InternalEvaluationMetrics)
+        if not attr.startswith("_") and isinstance(getattr(_InternalEvaluationMetrics, attr), str)
+    ]
+
+    # Combine all known metrics
+    all_known_metrics = evaluation_metrics_values + internal_metrics_values
+
+    # Find token count columns that belong to known metrics
+    token_count_cols = [
+        col
+        for col in df.columns
+        if (
+            any(
+                col.endswith(f"{metric}_prompt_tokens")
+                or col.endswith(f"{metric}_completion_tokens")
+                or col.endswith(f"{metric}_total_tokens")
+                for metric in all_known_metrics
+            )
+        )
+    ]
+
+    return token_count_cols
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -314,9 +363,16 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     handled_columns.extend(label_cols)
     defect_rates.update(label_defect_rates)

+    # Exclude token count columns from aggregation for known SDK metrics
+    token_count_cols = _get_token_count_columns_to_exclude(df)
+    handled_columns.extend(token_count_cols)
+
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)

+    # Convert "not applicable" strings to None to allow proper numeric aggregation
+    df = df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
+
     # NOTE: nan/None values don't count as as booleans, so boolean columns with
     # nan/None values won't have a mean produced from them.
     # This is different from label-based known evaluators, which have special handling.
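Note: the two hunks above add `_get_token_count_columns_to_exclude` and wire it into `_aggregate_metrics`, so per-metric token-count columns belonging to built-in SDK metrics are dropped before means are computed, while token counts from custom evaluators are still aggregated. A minimal standalone sketch of that suffix-matching behavior (metric and column names below are made-up examples, not taken from the package):

```python
import pandas as pd

known_metrics = ["coherence", "fluency"]  # stand-ins for EvaluationMetrics values

df = pd.DataFrame(
    {
        "outputs.coherence.coherence": [4, 5],
        "outputs.coherence.coherence_prompt_tokens": [812, 790],
        "outputs.coherence.coherence_total_tokens": [901, 880],
        "outputs.my_custom.my_custom_total_tokens": [55, 60],  # custom evaluator: not excluded
    }
)

token_suffixes = ("_prompt_tokens", "_completion_tokens", "_total_tokens")
excluded = [
    col
    for col in df.columns
    if any(col.endswith(f"{m}{suffix}") for m in known_metrics for suffix in token_suffixes)
]
means = df.drop(columns=excluded).mean()

print(excluded)  # the two coherence token-count columns
print(means)     # the coherence score and the custom evaluator's token column still get a mean
```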
@@ -789,7 +845,7 @@ def evaluate(
     try:
         user_agent: Optional[str] = kwargs.get("user_agent")
         with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
-            return _evaluate(
+            results = _evaluate(
                 evaluation_name=evaluation_name,
                 target=target,
                 data=data,
@@ -801,6 +857,7 @@
                 tags=tags,
                 **kwargs,
             )
+            return results
     except Exception as e:
         # Handle multiprocess bootstrap error
         bootstrap_error = (
@@ -896,6 +953,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     results_df = pd.DataFrame()
     metrics: Dict[str, float] = {}
     eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+    eval_run_summary_dict = {}

     # Start OAI eval runs if any graders are present.
     need_oai_run = len(graders) > 0
@@ -930,6 +988,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             got_local_results = True
         # TODO figure out how to update this printing to include OAI results?
         _print_summary(per_evaluator_results)
+        eval_run_summary_dict = {name: result["run_summary"] for name, result in per_evaluator_results.items()}
+        LOGGER.info(f"run_summary: \r\n{json.dumps(eval_run_summary_dict, indent=4)}")
     except EvaluationException as e:
         if need_get_oai_results:
             # If there are OAI graders, we only print a warning on local failures.
@@ -977,13 +1037,322 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements

     result_df_dict = results_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+    # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
+
+    eval_id: Optional[str] = kwargs.get("_eval_id")
+    eval_run_id: Optional[str] = kwargs.get("_eval_run_id")
+    eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("_eval_meta_data")
+    if kwargs.get("_convert_to_aoai_evaluation_result", False):
+        _convert_results_to_aoai_evaluation_results(
+            result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data
+        )
+    if app_insights_configuration := kwargs.get("_app_insights_configuration"):
+        emit_eval_result_events_to_app_insights(
+            app_insights_configuration, result["_evaluation_results_list"], evaluator_config
+        )

     if output_path:
         _write_output(output_path, result)
-
     return result


+def _build_internal_log_attributes(
+    event_data: Dict[str, Any],
+    metric_name: str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]],
+    internal_log_attributes: Dict[str, str],
+) -> Dict[str, str]:
+    """
+    Build internal log attributes for OpenTelemetry logging.
+
+    :param event_data: The event data containing threshold and name information
+    :type event_data: Dict[str, Any]
+    :param metric_name: The name of the metric being evaluated
+    :type metric_name: str
+    :param evaluator_config: Configuration for evaluators
+    :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+    :return: Dictionary of internal log attributes
+    :rtype: Dict[str, str]
+    """
+    # Add threshold if present
+    if event_data.get("threshold"):
+        internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
+
+    # Add testing criteria details if present
+    testing_criteria_name = event_data.get("name")
+    if testing_criteria_name:
+        internal_log_attributes["gen_ai.evaluation.testing_criteria.name"] = testing_criteria_name
+
+        # Get evaluator definition details
+        if evaluator_config and testing_criteria_name in evaluator_config:
+            testing_criteria_config = evaluator_config[testing_criteria_name]
+
+            if evaluator_name := testing_criteria_config.get("_evaluator_name"):
+                internal_log_attributes["gen_ai.evaluator.name"] = str(evaluator_name)
+
+            if evaluator_version := testing_criteria_config.get("_evaluator_version"):
+                internal_log_attributes["gen_ai.evaluator.version"] = str(evaluator_version)
+
+            if evaluator_id := testing_criteria_config.get("_evaluator_id"):
+                internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
+
+            if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
+                metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
+
+                if metric_config_detail:
+                    if metric_config_detail.get("min_value") is not None:
+                        internal_log_attributes["gen_ai.evaluation.min_value"] = str(metric_config_detail["min_value"])
+                    if metric_config_detail.get("max_value") is not None:
+                        internal_log_attributes["gen_ai.evaluation.max_value"] = str(metric_config_detail["max_value"])
+
+    return internal_log_attributes
+
+
+def _log_events_to_app_insights(
+    otel_logger,
+    events: List[Dict[str, Any]],
+    log_attributes: Dict[str, Any],
+    app_insights_config: AppInsightsConfig,
+    data_source_item: Optional[Dict[str, Any]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+) -> None:
+    """
+    Log independent events directly to App Insights using OpenTelemetry logging.
+    No spans are created - events are sent as pure log records.
+
+    :param otel_logger: OpenTelemetry logger instance
+    :type otel_logger: Logger
+    :param events: List of event data dictionaries to log
+    :type events: List[Dict[str, Any]]
+    :param log_attributes: Attributes dict to use for each event (already includes extra_attributes if present)
+    :type log_attributes: Dict[str, Any]
+    :param app_insights_config: App Insights configuration containing connection string
+    :type app_insights_config: AppInsightsConfig
+    :param data_source_item: Data source item containing trace, response, and agent information
+    :type data_source_item: Optional[Dict[str, Any]]
+    """
+
+    from opentelemetry import trace
+    from opentelemetry.trace import SpanContext, TraceFlags, NonRecordingSpan
+
+    try:
+        # Initialize values from AppInsights config as defaults
+        trace_id = None
+        span_id = None
+        response_id = None
+        conversation_id = None
+        previous_response_id = None
+        agent_id = app_insights_config.get("agent_id", None)
+        agent_version = app_insights_config.get("agent_version", None)
+        agent_name = app_insights_config.get("agent_name", None)
+
+        # Data source item values have higher priority and will override AppInsights config defaults
+        if data_source_item:
+            for key, value in data_source_item.items():
+                if key.endswith("trace_id") and value and isinstance(value, str):
+                    # Remove dashes if present
+                    trace_id_str = str(value).replace("-", "").lower()
+                    if len(trace_id_str) == 32:  # Valid trace_id length
+                        trace_id = int(trace_id_str, 16)
+                elif key == "previous_response_id" and value and isinstance(value, str):
+                    previous_response_id = value
+                elif key == "response_id" and value and isinstance(value, str):
+                    response_id = value
+                elif key == "conversation_id" and value and isinstance(value, str):
+                    conversation_id = value
+                elif key == "agent_id" and value and isinstance(value, str):
+                    agent_id = value
+                elif key.endswith("span_id") and value and isinstance(value, str):
+                    # Remove dashes if present and convert to int
+                    span_id_str = str(value).replace("-", "").lower()
+                    if len(span_id_str) == 16:  # Valid span_id length (64-bit = 16 hex chars)
+                        span_id = int(span_id_str, 16)
+                elif key == "agent_version" and value and isinstance(value, str):
+                    agent_version = value
+                elif key == "agent_name" and value and isinstance(value, str):
+                    agent_name = value
+
+        # Log each event as a separate log record
+        for i, event_data in enumerate(events):
+            try:
+                # Prepare log record attributes with specific mappings
+                # The standard attributes are already in https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-events.md#event-eventgen_aievaluationresult
+                metric_name = event_data.get("metric")
+                standard_log_attributes = {}
+                standard_log_attributes["microsoft.custom_event.name"] = EVALUATION_EVENT_NAME
+                standard_log_attributes["gen_ai.evaluation.name"] = metric_name
+                if event_data.get("score") is not None:
+                    standard_log_attributes["gen_ai.evaluation.score.value"] = event_data.get("score")
+                if event_data.get("label") is not None:
+                    standard_log_attributes["gen_ai.evaluation.score.label"] = event_data.get("label")
+
+                # Internal proposed attributes
+                # Put it in internal property bag for now, will be expanded if we got sign-off to Otel standard later.
+                internal_log_attributes = _build_internal_log_attributes(
+                    event_data, metric_name, evaluator_config, log_attributes
+                )
+
+                # Optional field that may not always be present
+                if "reason" in event_data:
+                    standard_log_attributes["gen_ai.evaluation.explanation"] = str(event_data["reason"])
+
+                # Handle error from sample if present
+                # Put the error message in error.type to follow OTel semantic conventions
+                error = event_data.get("sample", {}).get("error", {}).get("message", None)
+                if error:
+                    standard_log_attributes["error.type"] = error
+
+                # Handle redteam attack properties if present
+                if "properties" in event_data:
+                    properties = event_data["properties"]
+
+                    if "attack_success" in properties:
+                        internal_log_attributes["gen_ai.redteam.attack.success"] = str(properties["attack_success"])
+
+                    if "attack_technique" in properties:
+                        internal_log_attributes["gen_ai.redteam.attack.technique"] = str(properties["attack_technique"])
+
+                    if "attack_complexity" in properties:
+                        internal_log_attributes["gen_ai.redteam.attack.complexity"] = str(
+                            properties["attack_complexity"]
+                        )
+
+                    if "attack_success_threshold" in properties:
+                        internal_log_attributes["gen_ai.redteam.attack.success_threshold"] = str(
+                            properties["attack_success_threshold"]
+                        )
+
+                # Add data source item attributes if present
+                if response_id:
+                    standard_log_attributes["gen_ai.response.id"] = response_id
+                if conversation_id:
+                    standard_log_attributes["gen_ai.conversation.id"] = conversation_id
+                if previous_response_id:
+                    internal_log_attributes["gen_ai.previous.response.id"] = previous_response_id
+                if agent_id:
+                    standard_log_attributes["gen_ai.agent.id"] = agent_id
+                if agent_name:
+                    standard_log_attributes["gen_ai.agent.name"] = agent_name
+                if agent_version:
+                    internal_log_attributes["gen_ai.agent.version"] = agent_version
+
+                # Combine standard and internal attributes, put internal under the properties bag
+                standard_log_attributes["internal_properties"] = json.dumps(internal_log_attributes)
+                # Anonymize IP address to prevent Azure GeoIP enrichment and location tracking
+                standard_log_attributes["http.client_ip"] = "0.0.0.0"
+
+                # Create context with trace_id and span_id if present (for distributed tracing correlation)
+                ctx = None
+                if trace_id:
+                    span_context = SpanContext(
+                        trace_id=trace_id,
+                        span_id=span_id if span_id else 0,  # Use extracted span_id or 0 if not available
+                        is_remote=False,
+                        trace_flags=TraceFlags(0x01),
+                    )
+                    span = NonRecordingSpan(span_context)
+                    ctx = trace.set_span_in_context(span)
+
+                otel_logger.emit(
+                    timestamp=time.time_ns(),
+                    observed_timestamp=time.time_ns(),
+                    body=EVALUATION_EVENT_NAME,
+                    attributes=standard_log_attributes,
+                    context=ctx,
+                )
+
+            except Exception as e:
+                LOGGER.warning(f"Failed to log event {i}: {e}")
+
+    except Exception as e:
+        LOGGER.error(f"Failed to log events to App Insights: {e}")
+
+
+def emit_eval_result_events_to_app_insights(
+    app_insights_config: AppInsightsConfig,
+    results: List[Dict],
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+) -> None:
+    """
+    Emit evaluation result events to App Insights using OpenTelemetry logging.
+    Each result is logged as an independent log record, potentially including trace context.
+
+    :param app_insights_config: App Insights configuration containing connection string
+    :type app_insights_config: AppInsightsConfig
+    :param results: List of evaluation results to log
+    :type results: List[Dict]
+    """
+
+    from opentelemetry import _logs
+    from opentelemetry.sdk._logs import LoggerProvider
+    from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
+    from opentelemetry.sdk.resources import Resource
+    from opentelemetry.semconv.resource import ResourceAttributes
+    from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter
+
+    if not results:
+        LOGGER.debug("No results to log to App Insights")
+        return
+
+    try:
+        # Configure OpenTelemetry logging with anonymized Resource attributes
+
+        # Create a resource with minimal attributes to prevent sensitive data collection
+        # SERVICE_INSTANCE_ID maps to cloud_RoleInstance in Azure Monitor and prevents
+        # Azure Monitor from auto-detecting the device hostname
+        anonymized_resource = Resource.create(
+            {
+                ResourceAttributes.SERVICE_NAME: "unknown",
+                ResourceAttributes.SERVICE_INSTANCE_ID: "unknown",
+            }
+        )
+
+        logger_provider = LoggerProvider(resource=anonymized_resource)
+        _logs.set_logger_provider(logger_provider)
+
+        # Create Azure Monitor log exporter
+        azure_log_exporter = AzureMonitorLogExporter(connection_string=app_insights_config["connection_string"])
+
+        # Add the Azure Monitor exporter to the logger provider
+        logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
+
+        # Create a logger from OUR configured logger_provider (not the global one)
+        # This ensures the logger uses our anonymized resource
+        otel_logger = logger_provider.get_logger(__name__)
+
+        # Initialize base log attributes with extra_attributes if present, otherwise empty dict
+        base_log_attributes = app_insights_config.get("extra_attributes", {})
+
+        # Add AppInsights config attributes with proper semantic convention mappings
+        if "run_type" in app_insights_config:
+            base_log_attributes["gen_ai.evaluation.azure_ai_type"] = str(app_insights_config["run_type"])
+        if "schedule_type" in app_insights_config:
+            base_log_attributes["gen_ai.evaluation.azure_ai_scheduled"] = str(app_insights_config["schedule_type"])
+        if "run_id" in app_insights_config:
+            base_log_attributes["gen_ai.evaluation.run.id"] = str(app_insights_config["run_id"])
+        if "project_id" in app_insights_config:
+            base_log_attributes["gen_ai.azure_ai_project.id"] = str(app_insights_config["project_id"])
+
+        for result in results:
+            # Create a copy of base attributes for this result's events
+            log_attributes = base_log_attributes.copy()
+
+            _log_events_to_app_insights(
+                otel_logger=otel_logger,
+                events=result["results"],
+                log_attributes=log_attributes,
+                data_source_item=result["datasource_item"] if "datasource_item" in result else None,
+                evaluator_config=evaluator_config,
+                app_insights_config=app_insights_config,
+            )
+        # Force flush to ensure events are sent
+        logger_provider.force_flush()
+        LOGGER.info(f"Successfully logged {len(results)} evaluation results to App Insights")
+
+    except Exception as e:
+        LOGGER.error(f"Failed to emit evaluation results to App Insights: {e}")
+
+
 def _preprocess_data(
     data: Union[str, os.PathLike],
     evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
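Note: `emit_eval_result_events_to_app_insights` above is driven by the private `_app_insights_configuration` kwarg read in `_evaluate`. A hedged sketch of the dict shape it consumes, inferred only from the keys the new code looks up (all values below are placeholders, and the kwarg is internal and subject to change):

```python
# Keys mirror the lookups in emit_eval_result_events_to_app_insights /
# _log_events_to_app_insights; only "connection_string" is required by that code path.
app_insights_config = {
    "connection_string": "InstrumentationKey=00000000-0000-0000-0000-000000000000",
    "run_type": "evaluation",            # -> gen_ai.evaluation.azure_ai_type
    "schedule_type": "manual",           # -> gen_ai.evaluation.azure_ai_scheduled
    "run_id": "run-123",                 # -> gen_ai.evaluation.run.id
    "project_id": "proj-456",            # -> gen_ai.azure_ai_project.id
    "agent_id": "agent-1",               # default gen_ai.agent.id when the data source item has none
    "agent_name": "my-agent",            # default gen_ai.agent.name
    "agent_version": "1.0",              # default gen_ai.agent.version (internal attribute)
    "extra_attributes": {"team": "qa"},  # copied into every emitted log record
}
```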
@@ -1066,7 +1435,7 @@ def _preprocess_data(
         batch_run_data = input_data_df
     elif client_type == "pf_client":
         batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
-        # Ensure the absolute path is
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
     elif client_type == "code_client":
@@ -1131,11 +1500,36 @@ def _preprocess_data(
     # via target mapping.
     # If both the data and the output dictionary of the target function
     # have the same column, then the target function value is used.
+    # NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
+    # Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
     if input_data_df is not None:
+        if "conversation" in input_data_df.columns or "messages" in input_data_df.columns:
+            # No action is taken when 'conversation' or 'messages' columns are present,
+            # as these indicate chat/conversation data which should not be flattened or mapped by default.
+            pass
+        else:
+            input_data_df = _flatten_object_columns_for_default_mapping(input_data_df)
+
+    # Build default mapping for leaves:
+    if input_data_df is not None:
+        # First, map flattened nested columns (those containing a dot) to leaf names.
+        for col in input_data_df.columns:
+            # Skip target output columns
+            if col.startswith(Prefixes.TSG_OUTPUTS):
+                continue
+            # Skip root container columns (no dot) here; they'll be handled below if truly primitive.
+            if "." in col:
+                leaf_name = col.split(".")[-1]
+                if leaf_name not in column_mapping["default"]:
+                    column_mapping["default"][leaf_name] = f"${{data.{col}}}"
+
+        # Then, handle remaining top-level primitive columns (original logic).
         for col in input_data_df.columns:
-
-
-
+            if (
+                not col.startswith(Prefixes.TSG_OUTPUTS)
+                and col not in column_mapping["default"].keys()
+                and "." not in col  # only pure top-level primitives
+            ):
                 column_mapping["default"][col] = f"${{data.{col}}}"

     return __ValidatedData(
@@ -1149,6 +1543,79 @@
     )


+def _flatten_object_columns_for_default_mapping(
+    df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
+) -> pd.DataFrame:
+    """Flatten nested dictionary-valued columns into dotted leaf columns.
+
+    For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
+    leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
+    columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
+    all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
+    are all treated as leaves). Existing columns are never overwritten (idempotent behavior).
+
+    Example
+        If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
+        columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.
+
+    :param df: Input DataFrame to flatten in place.
+    :type df: ~pandas.DataFrame
+    :param root_prefixes: Optional iterable restricting which top-level columns are considered
+        for flattening. If ``None``, all columns containing at least one ``dict`` value are processed.
+    :type root_prefixes: Optional[Iterable[str]]
+    :return: The same DataFrame instance (returned for convenient chaining).
+    :rtype: ~pandas.DataFrame
+    """
+    candidate_cols = []
+    if root_prefixes is not None:
+        candidate_cols = [c for c in root_prefixes if c in df.columns]
+    else:
+        # pick columns where at least one non-null value is a dict
+        for c in df.columns:
+            series = df[c]
+            if series.map(lambda v: isinstance(v, dict)).any():
+                candidate_cols.append(c)
+
+    def _extract_leaves(obj: Any, prefix: str) -> Iterator[Tuple[str, Any]]:
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                new_prefix = f"{prefix}.{k}" if prefix else k
+                if isinstance(v, dict):
+                    yield from _extract_leaves(v, new_prefix)
+                else:
+                    # treat list / primitive / None as leaf
+                    yield new_prefix, v
+
+    for root_col in candidate_cols:
+        # Build a union of leaf paths across rows to ensure consistent columns
+        leaf_paths: Set[str] = set()
+        for val in df[root_col]:
+            if isinstance(val, dict):
+                for path, _ in _extract_leaves(val, root_col):
+                    leaf_paths.add(path)
+
+        if not leaf_paths:
+            continue
+
+        # Create each flattened column if absent
+        for path in leaf_paths:
+            if path in df.columns:
+                continue  # already present
+            relative_keys = path[len(root_col) + 1 :].split(".") if len(path) > len(root_col) else []
+
+            def getter(root_val: Any) -> Any:
+                cur = root_val
+                for rk in relative_keys:
+                    if not isinstance(cur, dict):
+                        return None
+                    cur = cur.get(rk, None)
+                return cur
+
+            df[path] = df[root_col].map(lambda rv: getter(rv) if isinstance(rv, dict) else None)
+
+    return df
+
+
 def _run_callable_evaluators(
     validated_data: __ValidatedData,
     fail_on_evaluator_errors: bool = False,
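Note: a standalone illustration of the flattening behavior that `_flatten_object_columns_for_default_mapping` describes in its docstring above. This is a simplified re-implementation for demonstration only, not the SDK helper itself:

```python
import pandas as pd

def _leaf_paths(obj, prefix):
    # Yield (dotted_path, value) for every non-dict leaf under obj.
    for key, value in obj.items():
        path = f"{prefix}.{key}"
        if isinstance(value, dict):
            yield from _leaf_paths(value, path)
        else:
            yield path, value

def _lookup(obj, keys):
    # Walk a nested dict by key list, returning None when a level is missing.
    for key in keys:
        if not isinstance(obj, dict):
            return None
        obj = obj.get(key)
    return obj

df = pd.DataFrame({"item": [{"a": {"b": 1, "c": 2}}, {"a": {"b": 3}}]})

paths = {p for row in df["item"] for p, _ in _leaf_paths(row, "item")}
for path in sorted(paths):
    keys = path.split(".")[1:]  # drop the root column name
    df[path] = df["item"].map(lambda row, keys=keys: _lookup(row, keys))

print(sorted(c for c in df.columns if "." in c))  # ['item.a.b', 'item.a.c']
# Each dotted column then receives a default mapping such as "${data.item.a.b}", keyed by its leaf name.
```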
@@ -1304,3 +1771,652 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
         category=ErrorCategory.FAILED_EXECUTION,
         blame=ErrorBlame.UNKNOWN,
     )
+
+
+def _convert_results_to_aoai_evaluation_results(
+    results: EvaluationResult,
+    logger: logging.Logger,
+    eval_id: Optional[str] = None,
+    eval_run_id: Optional[str] = None,
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] = None,
+    eval_run_summary: Optional[Dict[str, Any]] = None,
+    eval_meta_data: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Convert evaluation results to AOAI evaluation results format.
+
+    Each row of input results.rows looks like:
+    {"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe",
+    "inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.",
+    "outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5}
+
+    Convert each row into new RunOutputItem object with results array.
+
+    :param results: The evaluation results to convert
+    :type results: EvaluationResult
+    :param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria
+    :type eval_meta_data: Dict[str, Any]
+    :param logger: Logger instance
+    :type logger: logging.Logger
+    :return: EvaluationResult with converted evaluation results in AOAI format
+    :rtype: EvaluationResult
+    """
+
+    if evaluators is None:
+        return
+
+    # Get the testing_criteria_name and testing_criteria_type from evaluators
+    testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {}
+    criteria_name_types_from_meta: Optional[Dict[str, str]] = {}
+    if eval_meta_data and "testing_criteria" in eval_meta_data:
+        testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")
+        if testing_criteria_list is not None:
+            for criteria in testing_criteria_list:
+                criteria_name = criteria.get("name")
+                criteria_type = criteria.get("type")
+                if criteria_name is not None and criteria_type is not None:
+                    criteria_name_types_from_meta[criteria_name] = criteria
+
+    for criteria_name, evaluator in evaluators.items():
+        criteria_type = None
+        metrics = []
+        if criteria_name in criteria_name_types_from_meta:
+            criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
+            evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
+            current_evaluator_metrics = criteria_name_types_from_meta[criteria_name].get("metrics", None)
+            if current_evaluator_metrics and len(current_evaluator_metrics) > 0:
+                metrics.extend(current_evaluator_metrics)
+            elif evaluator_name:
+                if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+                    evaluator_name = evaluator_name.replace("builtin.", "")
+                metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
+                if metrics_mapped and len(metrics_mapped) > 0:
+                    metrics.extend(metrics_mapped)
+                else:
+                    metrics.append(criteria_name)
+        elif isinstance(evaluator, AzureOpenAIGrader):
+            criteria_type = evaluator._type  # pylint: disable=protected-access
+            metrics.append(criteria_name)
+        elif isinstance(evaluator, EvaluatorBase):
+            criteria_type = "azure_ai_evaluator"
+            evaluator_class_name = evaluator.__class__.__name__
+            eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None)
+            if eval_name:
+                metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
+                if metrics_mapped and len(metrics_mapped) > 0:
+                    metrics.extend(metrics_mapped)
+                else:
+                    metrics.append(criteria_name)
+        else:
+            criteria_type = "unknown"
+            metrics.append(criteria_name)
+        testing_criteria_name_types_metrics[criteria_name] = {"type": criteria_type, "metrics": metrics}
+
+    created_time = int(time.time())
+    converted_rows = []
+
+    for row_idx, row in enumerate(results.get("rows", [])):
+        # Group outputs by test criteria name
+        criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()}
+        input_groups = {}
+        top_sample = {}
+        for key, value in row.items():
+            if key.startswith("outputs."):
+                # Parse key: outputs.<test-criteria-name>.<metric>
+                parts = key.split(".", 2)  # Split into max 3 parts: ['outputs', '<criteria-name>', '<metric>']
+                if len(parts) >= 3:
+                    criteria_name = parts[1]
+                    metric_name = parts[2]
+
+                    if criteria_name not in criteria_groups:
+                        criteria_groups[criteria_name] = {}
+
+                    criteria_groups[criteria_name][metric_name] = value
+            elif key.startswith("inputs."):
+                input_key = key.replace("inputs.", "")
+                if input_key not in input_groups:
+                    input_groups[input_key] = value
+
+        # Convert each criteria group to RunOutputItem result
+        run_output_results = []
+        for criteria_name, metrics in criteria_groups.items():
+            # Extract metrics for this criteria
+            expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
+            criteria_type = testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "unknown")
+            result_per_metric = {}
+            # Find score - look for various score patterns
+            for metric_key, metric_value in metrics.items():
+                if metric_key.endswith("_score") or metric_key == "score":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"score": metric_value}
+                    else:
+                        result_per_metric[metric]["score"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
+                if metric_key == "passed":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"passed": metric_value}
+                    else:
+                        result_per_metric[metric]["passed"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "passed", metric, metric_value)
+                elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    label = metric_value
+                    passed = (
+                        True if (str(metric_value).lower() == "pass" or str(metric_value).lower() == "true") else False
+                    )
+                    if metric not in result_per_metric:
+                        if criteria_type == "azure_ai_evaluator":
+                            result_per_metric[metric] = {"label": label, "passed": passed}
+                        else:
+                            result_per_metric[metric] = {"label": label}
+                    else:
+                        result_per_metric[metric]["label"] = metric_value
+                        if criteria_type == "azure_ai_evaluator":
+                            result_per_metric[metric]["passed"] = passed
+                    _append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
+                    if criteria_type == "azure_ai_evaluator":
+                        _append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
+                elif (
+                    metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
+                ) or metric_key == "reason":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"reason": metric_value}
+                    else:
+                        result_per_metric[metric]["reason"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
+                elif metric_key.endswith("_threshold") or metric_key == "threshold":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"threshold": metric_value}
+                    else:
+                        result_per_metric[metric]["threshold"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
+                elif metric_key == "sample":
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
+                elif metric_key.endswith("_finish_reason"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"finish_reason": metric_value}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"finish_reason": metric_value}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "finish_reason" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["finish_reason"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "finish_reason"
+                    )
+                elif metric_key.endswith("_model"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"model": metric_value}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"model": metric_value}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "model" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["model"] = metric_value
+                    _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
+                elif metric_key.endswith("_sample_input"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    input_metric_val_json: Optional[List[Dict[str, Any]]] = []
+                    try:
+                        input_metric_val_json = json.loads(metric_value)
+                    except Exception as e:
+                        logger.warning(f"Failed to parse _sample_input value as JSON: {e}")
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"input": input_metric_val_json}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"input": input_metric_val_json}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "input" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["input"] = input_metric_val_json
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, input_metric_val_json, "input"
+                    )
+                elif metric_key.endswith("_sample_output"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    output_metric_val_json: Optional[List[Dict[str, Any]]] = []
+                    try:
+                        output_metric_val_json = json.loads(metric_value)
+                    except Exception as e:
+                        logger.warning(f"Failed to parse _sample_output value as JSON: {e}")
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"output": output_metric_val_json}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"output": output_metric_val_json}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "output" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["output"] = output_metric_val_json
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, output_metric_val_json, "output"
+                    )
+                elif metric_key.endswith("_total_tokens"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"usage": {"total_tokens": metric_value}}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "usage" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
+                    )
+                elif metric_key.endswith("_prompt_tokens"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"usage": {"prompt_tokens": metric_value}}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "usage" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
+                    )
+                elif metric_key.endswith("_completion_tokens"):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    if metric not in result_per_metric:
+                        result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
+                    elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+                        result_per_metric[metric]["sample"] = {"usage": {"completion_tokens": metric_value}}
+                    elif (
+                        metric in result_per_metric
+                        and "sample" in result_per_metric[metric]
+                        and "usage" not in result_per_metric[metric]["sample"]
+                    ):
+                        result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
+                    else:
+                        result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
+                    _append_indirect_attachments_to_results(
+                        result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
+                    )
+                elif not any(
+                    metric_key.endswith(suffix)
+                    for suffix in [
+                        "_result",
+                        "_reason",
+                        "_threshold",
+                        "_label",
+                        "_score",
+                        "_model",
+                        "_finish_reason",
+                        "_sample_input",
+                        "_sample_output",
+                        "_total_tokens",
+                        "_prompt_tokens",
+                        "_completion_tokens",
+                    ]
+                ):
+                    metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                    # If no score found yet and this doesn't match other patterns, use as score
+                    if metric_key == metric and metric not in result_per_metric:
+                        result_per_metric[metric] = {"score": metric_value}
+                    elif metric_key == metric and result_per_metric[metric].get("score", None) is None:
+                        result_per_metric[metric]["score"] = metric_value
+
+            for metric, metric_values in result_per_metric.items():
+                score = metric_values.get("score", None)
+                label = metric_values.get("label", None)
+                reason = metric_values.get("reason", None)
+                threshold = metric_values.get("threshold", None)
+                passed = metric_values.get("passed", None)
+                sample = metric_values.get("sample", None)
+
+                # Create result object for this criteria
+                result_obj = {
+                    "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                        "type", "azure_ai_evaluator"
+                    ),
+                    "name": criteria_name,  # Use criteria name as name
+                    "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                }
+                # Add optional fields
+                if (
+                    metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
+                    or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
+                    or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
+                ):
+                    copy_label = label
+                    if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
+                        label = "fail"
+                        score = 0.0
+                        passed = False
+                    else:
+                        label = "pass"
+                        score = 1.0
+                        passed = True
+                result_obj["score"] = (
+                    score if not (score is None or (isinstance(score, float) and math.isnan(score))) else None
+                )
+                result_obj["label"] = label
+                result_obj["reason"] = reason
+                result_obj["threshold"] = threshold
+                result_obj["passed"] = passed
+
+                if sample is not None:
+                    result_obj["sample"] = sample
+                    top_sample = sample  # Save top sample for the row
+                run_output_results.append(result_obj)
+
+            if (
+                eval_run_summary
+                and criteria_name in eval_run_summary
+                and isinstance(eval_run_summary[criteria_name], dict)
+                and "error_code" in eval_run_summary[criteria_name]
+            ) and eval_run_summary[criteria_name].get("error_code", None) is not None:
+                error_info = (
+                    {
+                        "code": eval_run_summary[criteria_name].get("error_code", None),
+                        "message": eval_run_summary[criteria_name].get("error_message", None),
+                    }
+                    if eval_run_summary[criteria_name].get("error_code", None) is not None
+                    else None
+                )
+                sample = {"error": error_info} if error_info is not None else None
+                # Create result object for this criteria
+                metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
+                for metric in metrics:
+                    result_obj = {
+                        "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                            "type", "azure_ai_evaluator"
+                        ),
+                        "name": criteria_name,  # Use criteria name as name
+                        "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                        "score": None,
+                        "label": None,
+                        "reason": None,
+                        "threshold": None,
+                        "passed": None,
+                        "sample": sample,
+                    }
+                    run_output_results.append(result_obj)
+
+        # Create RunOutputItem structure
+        run_output_item = {
+            "object": "eval.run.output_item",
+            "id": f"{row_idx+1}",
+            "run_id": eval_run_id,
+            "eval_id": eval_id,
+            "created_at": created_time,
+            "datasource_item_id": row_idx,
+            "datasource_item": input_groups,
+            "results": run_output_results,
+            "status": "completed" if len(run_output_results) > 0 else "error",
+        }
+
+        run_output_item["sample"] = top_sample
+
+        converted_rows.append(run_output_item)
+
+    # Create converted results maintaining the same structure
+    results["_evaluation_results_list"] = converted_rows
+    logger.info(
+        f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
+    )
+    # Calculate summary statistics
+    evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger)
+    results["_evaluation_summary"] = evaluation_summary
|
+
logger.info(
|
|
2190
|
+
f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
|
|
2191
|
+
)
|
|
2192
|
+
|
|
2193
|
+
|
|
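The added block above flattens each evaluated row into an OpenAI-style run output item. A minimal sketch of the shape one converted row can take, for illustration only: the keys mirror run_output_item and result_obj in the diff above, while the criteria name, scores, and usage numbers are hypothetical.

# Illustrative only; keys come from the code above, values are made up.
example_run_output_item = {
    "object": "eval.run.output_item",
    "id": "1",  # f"{row_idx+1}"
    "run_id": "example-run-id",
    "eval_id": "example-eval-id",
    "created_at": 1700000000,
    "datasource_item_id": 0,
    "datasource_item": {"query": "...", "response": "..."},
    "results": [
        {
            "type": "azure_ai_evaluator",
            "name": "example-criteria",  # testing criteria name
            "metric": "example-metric",
            "score": 4.0,
            "label": "pass",
            "reason": "...",
            "threshold": 3,
            "passed": True,
            "sample": {"usage": {"prompt_tokens": 120, "completion_tokens": 30}},
        }
    ],
    "status": "completed",
    "sample": {"usage": {"prompt_tokens": 120, "completion_tokens": 30}},
}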
+def _append_indirect_attachments_to_results(
+    current_result_dict: Dict[str, Any],
+    result_name: str,
+    metric: str,
+    metric_value: Any,
+    nested_result_name: Optional[str] = None,
+    secondnested_result_name: Optional[str] = None,
+) -> None:
+    """
+    Append indirect attachments to the current result dictionary.
+
+    :param current_result_dict: The current result dictionary to update
+    :type current_result_dict: Dict[str, Any]
+    :param result_name: The result name
+    :type result_name: str
+    :param metric: The metric name
+    :type metric: str
+    :param metric_value: The value of the metric
+    :type metric_value: Any
+    """
+    if metric == "xpia" and result_name:
+        for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
+            if nested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: metric_value}
+                else:
+                    current_result_dict[metric_extended][result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is not None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {
+                        result_name: {nested_result_name: {secondnested_result_name: metric_value}}
+                    }
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {
+                        nested_result_name: {secondnested_result_name: metric_value}
+                    }
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = {
+                        secondnested_result_name: metric_value
+                    }
+                else:
+                    (
+                        current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
+                    ) = metric_value
+
+
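A minimal usage sketch of the helper above, for illustration only: a single "xpia" value fans out to the three extended XPIA metrics. The dictionary contents and the token count are hypothetical.

# Illustration only; values are hypothetical.
results: dict = {}
_append_indirect_attachments_to_results(results, "score", "xpia", 1.0)
_append_indirect_attachments_to_results(results, "sample", "xpia", 57, "usage", "prompt_tokens")
# results now resembles, for each of the three extended metrics:
# {"xpia_manipulated_content": {"score": 1.0, "sample": {"usage": {"prompt_tokens": 57}}}, ...}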
+def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+    """
+    Get the metric name from the testing criteria and metric key.
+
+    :param testing_criteria_name: The name of the testing criteria
+    :type testing_criteria_name: str
+    :param metric_key: The metric key to look for
+    :type metric_key: str
+    :param metric_list: List of expected metrics for the testing criteria
+    :type metric_list: List[str]
+    :return: The metric name if found, otherwise the testing criteria name
+    :rtype: str
+    """
+    metric = None
+
+    if metric_key == "xpia_manipulated_content":
+        metric = "xpia_manipulated_content"
+        return metric
+    elif metric_key == "xpia_intrusion":
+        metric = "xpia_intrusion"
+        return metric
+    elif metric_key == "xpia_information_gathering":
+        metric = "xpia_information_gathering"
+        return metric
+    for expected_metric in metric_list:
+        if metric_key.startswith(expected_metric):
+            metric = expected_metric
+            break
+    if metric is None:
+        metric = testing_criteria_name
+    return metric
+
+
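Illustrative calls against the helper above, with hypothetical criteria and metric names: explicit xpia_* keys map to themselves, otherwise the first expected metric that prefixes the key wins, and unmatched keys fall back to the testing criteria name.

# Illustration only; criteria and metric names are hypothetical.
_get_metric_from_criteria("my_criteria", "coherence_reason", ["coherence"])  # -> "coherence"
_get_metric_from_criteria("my_criteria", "xpia_intrusion", ["xpia"])         # -> "xpia_intrusion"
_get_metric_from_criteria("my_criteria", "unmatched_key", ["coherence"])     # -> "my_criteria"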
+def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger) -> Dict[str, Any]:
+    """
+    Calculate summary statistics for AOAI evaluation results.
+
+    :param aoai_results: List of AOAI result objects (run_output_items)
+    :type aoai_results: list
+    :return: Summary statistics dictionary
+    :rtype: Dict[str, Any]
+    """
+    # Calculate result counts based on aoaiResults
+    result_counts = {"total": 0, "errored": 0, "failed": 0, "passed": 0}
+
+    # Count results by status and calculate per model usage
+    model_usage_stats = {}  # Dictionary to aggregate usage by model
+    result_counts_stats = {}  # Dictionary to aggregate usage by model
+
+    for aoai_result in aoai_results:
+        logger.info(
+            f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}"
+        )
+        result_counts["total"] += 1
+        passed_count = 0
+        failed_count = 0
+        error_count = 0
+        if isinstance(aoai_result, dict) and "results" in aoai_result:
+            logger.info(
+                f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}"
+            )
+            for result_item in aoai_result["results"]:
+                if isinstance(result_item, dict):
+                    # Check if the result has a 'passed' field
+                    if "passed" in result_item and result_item["passed"] is not None:
+                        testing_criteria = result_item.get("name", "")
+                        if testing_criteria not in result_counts_stats:
+                            result_counts_stats[testing_criteria] = {
+                                "testing_criteria": testing_criteria,
+                                "failed": 0,
+                                "passed": 0,
+                            }
+                        if result_item["passed"] is True:
+                            passed_count += 1
+                            result_counts_stats[testing_criteria]["passed"] += 1
+
+                        elif result_item["passed"] is False:
+                            failed_count += 1
+                            result_counts_stats[testing_criteria]["failed"] += 1
+                    # Check if the result indicates an error status
+                    elif ("status" in result_item and result_item["status"] in ["error", "errored"]) or (
+                        "sample" in result_item
+                        and isinstance(result_item["sample"], dict)
+                        and result_item["sample"].get("error", None) is not None
+                    ):
+                        error_count += 1
+        elif hasattr(aoai_result, "status") and aoai_result.status == "error":
+            error_count += 1
+        elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
+            error_count += 1
+
+        if error_count > 0:
+            result_counts["errored"] += 1
+        elif failed_count > 0:
+            result_counts["failed"] += 1
+        elif (
+            error_count == 0
+            and failed_count == 0
+            and passed_count > 0
+            and passed_count == len(aoai_result.get("results", []))
+        ):
+            result_counts["passed"] += 1
+
+        # Extract usage statistics from aoai_result.sample
+        sample_data_list = []
+        dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
+        dup_usage_list.remove("xpia")
+        if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
+            for result_item in aoai_result["results"]:
+                if (
+                    isinstance(result_item, dict)
+                    and "sample" in result_item
+                    and result_item["sample"]
+                    and result_item["metric"] not in dup_usage_list
+                ):
+                    sample_data_list.append(result_item["sample"])
+
+        for sample_data in sample_data_list:
+            if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
+                usage_data = sample_data["usage"]
+                model_name = sample_data.get("model", "unknown")
+                if model_name not in model_usage_stats:
+                    model_usage_stats[model_name] = {
+                        "invocation_count": 0,
+                        "total_tokens": 0,
+                        "prompt_tokens": 0,
+                        "completion_tokens": 0,
+                        "cached_tokens": 0,
+                    }
+                # Aggregate usage statistics
+                model_stats = model_usage_stats[model_name]
+                model_stats["invocation_count"] += 1
+                if isinstance(usage_data, dict):
+                    model_stats["total_tokens"] += usage_data.get("total_tokens", 0)
+                    model_stats["prompt_tokens"] += usage_data.get("prompt_tokens", 0)
+                    model_stats["completion_tokens"] += usage_data.get("completion_tokens", 0)
+                    model_stats["cached_tokens"] += usage_data.get("cached_tokens", 0)
+
+    # Convert model usage stats to list format matching EvaluationRunPerModelUsage
+    per_model_usage = []
+    for model_name, stats in model_usage_stats.items():
+        per_model_usage.append(
+            {
+                "model_name": model_name,
+                "invocation_count": stats["invocation_count"],
+                "total_tokens": stats["total_tokens"],
+                "prompt_tokens": stats["prompt_tokens"],
+                "completion_tokens": stats["completion_tokens"],
+                "cached_tokens": stats["cached_tokens"],
+            }
+        )
+    result_counts_stats_val = []
+    logger.info(f"\r\n Result counts stats: {result_counts_stats}")
+    for criteria_name, stats_val in result_counts_stats.items():
+        if isinstance(stats_val, dict):
+            logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}")
+            result_counts_stats_val.append(
+                {
+                    "testing_criteria": criteria_name,
+                    "passed": stats_val.get("passed", 0),
+                    "failed": stats_val.get("failed", 0),
+                }
+            )
+    return {
+        "result_counts": result_counts,
+        "per_model_usage": per_model_usage,
+        "per_testing_criteria_results": result_counts_stats_val,
+    }
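A minimal sketch of how the summary helper above could be exercised, for illustration only: it assumes the private helper is importable from this module, the row, criteria name, model name, and token counts are hypothetical, and the commented output mirrors the return structure shown in the code.

import logging

# Illustration only; all names and numbers below are hypothetical.
rows = [
    {
        "status": "completed",
        "results": [
            {
                "name": "example-criteria",
                "metric": "example-metric",
                "passed": True,
                "sample": {
                    "model": "example-model",
                    "usage": {"total_tokens": 150, "prompt_tokens": 120, "completion_tokens": 30, "cached_tokens": 0},
                },
            }
        ],
    }
]
summary = _calculate_aoai_evaluation_summary(rows, logging.getLogger(__name__))
# summary resembles:
# {
#     "result_counts": {"total": 1, "errored": 0, "failed": 0, "passed": 1},
#     "per_model_usage": [{"model_name": "example-model", "invocation_count": 1, "total_tokens": 150,
#                          "prompt_tokens": 120, "completion_tokens": 30, "cached_tokens": 0}],
#     "per_testing_criteria_results": [{"testing_criteria": "example-criteria", "passed": 1, "failed": 0}],
# }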