azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -5,10 +5,12 @@ import inspect
 import contextlib
 import json
 import logging
+import math
 import os
 import re
 import tempfile
 import json
+import time
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
 
 from openai import OpenAI, AzureOpenAI
@@ -20,7 +22,6 @@ from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform
 from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
 from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
 
 from .._constants import (
@@ -32,8 +33,10 @@ from .._constants import (
     _InternalEvaluationMetrics,
     BINARY_AGGREGATE_SUFFIX,
     DEFAULT_OAI_EVAL_RUN_NAME,
+    EVALUATION_EVENT_NAME,
+    _EvaluatorMetricMapping,
 )
-from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig
 from .._user_agent import UserAgentSingleton
 from ._batch_run import (
     EvalRunContext,
@@ -283,6 +286,51 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
     return results
 
 
+def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
+    """Identify token count columns from known SDK metrics that should be excluded from aggregation.
+
+    Token counts from custom evaluators are not excluded, only those from EvaluationMetrics
+    and _InternalEvaluationMetrics.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: List of column names to exclude from aggregation.
+    :rtype: List[str]
+    """
+    # Get all metric values from EvaluationMetrics class
+    evaluation_metrics_values = [
+        getattr(EvaluationMetrics, attr)
+        for attr in dir(EvaluationMetrics)
+        if not attr.startswith("_") and isinstance(getattr(EvaluationMetrics, attr), str)
+    ]
+
+    # Get all metric values from _InternalEvaluationMetrics class
+    internal_metrics_values = [
+        getattr(_InternalEvaluationMetrics, attr)
+        for attr in dir(_InternalEvaluationMetrics)
+        if not attr.startswith("_") and isinstance(getattr(_InternalEvaluationMetrics, attr), str)
+    ]
+
+    # Combine all known metrics
+    all_known_metrics = evaluation_metrics_values + internal_metrics_values
+
+    # Find token count columns that belong to known metrics
+    token_count_cols = [
+        col
+        for col in df.columns
+        if (
+            any(
+                col.endswith(f"{metric}_prompt_tokens")
+                or col.endswith(f"{metric}_completion_tokens")
+                or col.endswith(f"{metric}_total_tokens")
+                for metric in all_known_metrics
+            )
+        )
+    ]
+
+    return token_count_cols
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -315,6 +363,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     handled_columns.extend(label_cols)
     defect_rates.update(label_defect_rates)
 
+    # Exclude token count columns from aggregation for known SDK metrics
+    token_count_cols = _get_token_count_columns_to_exclude(df)
+    handled_columns.extend(token_count_cols)
+
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
 
@@ -793,7 +845,7 @@ def evaluate(
     try:
         user_agent: Optional[str] = kwargs.get("user_agent")
         with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
-            return _evaluate(
+            results = _evaluate(
                 evaluation_name=evaluation_name,
                 target=target,
                 data=data,
@@ -805,6 +857,7 @@ def evaluate(
                 tags=tags,
                 **kwargs,
             )
+            return results
     except Exception as e:
         # Handle multiprocess bootstrap error
         bootstrap_error = (
@@ -900,6 +953,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     results_df = pd.DataFrame()
     metrics: Dict[str, float] = {}
     eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+    eval_run_summary_dict = {}
 
     # Start OAI eval runs if any graders are present.
     need_oai_run = len(graders) > 0
@@ -934,6 +988,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         got_local_results = True
         # TODO figure out how to update this printing to include OAI results?
         _print_summary(per_evaluator_results)
+        eval_run_summary_dict = {name: result["run_summary"] for name, result in per_evaluator_results.items()}
+        LOGGER.info(f"run_summary: \r\n{json.dumps(eval_run_summary_dict, indent=4)}")
     except EvaluationException as e:
         if need_get_oai_results:
             # If there are OAI graders, we only print a warning on local failures.
@@ -981,13 +1037,322 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
 
     result_df_dict = results_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+    # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
+
+    eval_id: Optional[str] = kwargs.get("_eval_id")
+    eval_run_id: Optional[str] = kwargs.get("_eval_run_id")
+    eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("_eval_meta_data")
+    if kwargs.get("_convert_to_aoai_evaluation_result", False):
+        _convert_results_to_aoai_evaluation_results(
+            result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data
+        )
+    if app_insights_configuration := kwargs.get("_app_insights_configuration"):
+        emit_eval_result_events_to_app_insights(
+            app_insights_configuration, result["_evaluation_results_list"], evaluator_config
+        )
 
     if output_path:
         _write_output(output_path, result)
-
     return result
 
 
+def _build_internal_log_attributes(
+    event_data: Dict[str, Any],
+    metric_name: str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]],
+    internal_log_attributes: Dict[str, str],
+) -> Dict[str, str]:
+    """
+    Build internal log attributes for OpenTelemetry logging.
+
+    :param event_data: The event data containing threshold and name information
+    :type event_data: Dict[str, Any]
+    :param metric_name: The name of the metric being evaluated
+    :type metric_name: str
+    :param evaluator_config: Configuration for evaluators
+    :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+    :return: Dictionary of internal log attributes
+    :rtype: Dict[str, str]
+    """
+    # Add threshold if present
+    if event_data.get("threshold"):
+        internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
+
+    # Add testing criteria details if present
+    testing_criteria_name = event_data.get("name")
+    if testing_criteria_name:
+        internal_log_attributes["gen_ai.evaluation.testing_criteria.name"] = testing_criteria_name
+
+    # Get evaluator definition details
+    if evaluator_config and testing_criteria_name in evaluator_config:
+        testing_criteria_config = evaluator_config[testing_criteria_name]
+
+        if evaluator_name := testing_criteria_config.get("_evaluator_name"):
+            internal_log_attributes["gen_ai.evaluator.name"] = str(evaluator_name)
+
+        if evaluator_version := testing_criteria_config.get("_evaluator_version"):
+            internal_log_attributes["gen_ai.evaluator.version"] = str(evaluator_version)
+
+        if evaluator_id := testing_criteria_config.get("_evaluator_id"):
+            internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
+
+        if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
+            metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
+
+            if metric_config_detail:
+                if metric_config_detail.get("min_value") is not None:
+                    internal_log_attributes["gen_ai.evaluation.min_value"] = str(metric_config_detail["min_value"])
+                if metric_config_detail.get("max_value") is not None:
+                    internal_log_attributes["gen_ai.evaluation.max_value"] = str(metric_config_detail["max_value"])
+
+    return internal_log_attributes
+
+
+def _log_events_to_app_insights(
+    otel_logger,
+    events: List[Dict[str, Any]],
+    log_attributes: Dict[str, Any],
+    app_insights_config: AppInsightsConfig,
+    data_source_item: Optional[Dict[str, Any]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+) -> None:
+    """
+    Log independent events directly to App Insights using OpenTelemetry logging.
+    No spans are created - events are sent as pure log records.
+
+    :param otel_logger: OpenTelemetry logger instance
+    :type otel_logger: Logger
+    :param events: List of event data dictionaries to log
+    :type events: List[Dict[str, Any]]
+    :param log_attributes: Attributes dict to use for each event (already includes extra_attributes if present)
+    :type log_attributes: Dict[str, Any]
+    :param app_insights_config: App Insights configuration containing connection string
+    :type app_insights_config: AppInsightsConfig
+    :param data_source_item: Data source item containing trace, response, and agent information
+    :type data_source_item: Optional[Dict[str, Any]]
+    """
+
+    from opentelemetry import trace
+    from opentelemetry.trace import SpanContext, TraceFlags, NonRecordingSpan
+
+    try:
+        # Initialize values from AppInsights config as defaults
+        trace_id = None
+        span_id = None
+        response_id = None
+        conversation_id = None
+        previous_response_id = None
+        agent_id = app_insights_config.get("agent_id", None)
+        agent_version = app_insights_config.get("agent_version", None)
+        agent_name = app_insights_config.get("agent_name", None)
+
+        # Data source item values have higher priority and will override AppInsights config defaults
+        if data_source_item:
+            for key, value in data_source_item.items():
+                if key.endswith("trace_id") and value and isinstance(value, str):
+                    # Remove dashes if present
+                    trace_id_str = str(value).replace("-", "").lower()
+                    if len(trace_id_str) == 32:  # Valid trace_id length
+                        trace_id = int(trace_id_str, 16)
+                elif key == "previous_response_id" and value and isinstance(value, str):
+                    previous_response_id = value
+                elif key == "response_id" and value and isinstance(value, str):
+                    response_id = value
+                elif key == "conversation_id" and value and isinstance(value, str):
+                    conversation_id = value
+                elif key == "agent_id" and value and isinstance(value, str):
+                    agent_id = value
+                elif key.endswith("span_id") and value and isinstance(value, str):
+                    # Remove dashes if present and convert to int
+                    span_id_str = str(value).replace("-", "").lower()
+                    if len(span_id_str) == 16:  # Valid span_id length (64-bit = 16 hex chars)
+                        span_id = int(span_id_str, 16)
+                elif key == "agent_version" and value and isinstance(value, str):
+                    agent_version = value
+                elif key == "agent_name" and value and isinstance(value, str):
+                    agent_name = value
+
+        # Log each event as a separate log record
+        for i, event_data in enumerate(events):
+            try:
+                # Prepare log record attributes with specific mappings
+                # The standard attributes are already in https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-events.md#event-eventgen_aievaluationresult
+                metric_name = event_data.get("metric")
+                standard_log_attributes = {}
+                standard_log_attributes["microsoft.custom_event.name"] = EVALUATION_EVENT_NAME
+                standard_log_attributes["gen_ai.evaluation.name"] = metric_name
+                if event_data.get("score") is not None:
+                    standard_log_attributes["gen_ai.evaluation.score.value"] = event_data.get("score")
+                if event_data.get("label") is not None:
+                    standard_log_attributes["gen_ai.evaluation.score.label"] = event_data.get("label")
+
+                # Internal proposed attributes
+                # Put it in internal property bag for now, will be expanded if we got sign-off to Otel standard later.
+                internal_log_attributes = _build_internal_log_attributes(
+                    event_data, metric_name, evaluator_config, log_attributes
+                )
+
+                # Optional field that may not always be present
+                if "reason" in event_data:
+                    standard_log_attributes["gen_ai.evaluation.explanation"] = str(event_data["reason"])
+
+                # Handle error from sample if present
+                # Put the error message in error.type to follow OTel semantic conventions
+                error = event_data.get("sample", {}).get("error", {}).get("message", None)
+                if error:
+                    standard_log_attributes["error.type"] = error
+
+                # Handle redteam attack properties if present
+                if "properties" in event_data:
+                    properties = event_data["properties"]
+
+                    if "attack_success" in properties:
+                        internal_log_attributes["gen_ai.redteam.attack.success"] = str(properties["attack_success"])
+
+                    if "attack_technique" in properties:
+                        internal_log_attributes["gen_ai.redteam.attack.technique"] = str(properties["attack_technique"])
+
+                    if "attack_complexity" in properties:
+                        internal_log_attributes["gen_ai.redteam.attack.complexity"] = str(
+                            properties["attack_complexity"]
+                        )
+
+                    if "attack_success_threshold" in properties:
+                        internal_log_attributes["gen_ai.redteam.attack.success_threshold"] = str(
+                            properties["attack_success_threshold"]
+                        )
+
+                # Add data source item attributes if present
+                if response_id:
+                    standard_log_attributes["gen_ai.response.id"] = response_id
+                if conversation_id:
+                    standard_log_attributes["gen_ai.conversation.id"] = conversation_id
+                if previous_response_id:
+                    internal_log_attributes["gen_ai.previous.response.id"] = previous_response_id
+                if agent_id:
+                    standard_log_attributes["gen_ai.agent.id"] = agent_id
+                if agent_name:
+                    standard_log_attributes["gen_ai.agent.name"] = agent_name
+                if agent_version:
+                    internal_log_attributes["gen_ai.agent.version"] = agent_version
+
+                # Combine standard and internal attributes, put internal under the properties bag
+                standard_log_attributes["internal_properties"] = json.dumps(internal_log_attributes)
+                # Anonymize IP address to prevent Azure GeoIP enrichment and location tracking
+                standard_log_attributes["http.client_ip"] = "0.0.0.0"
+
+                # Create context with trace_id and span_id if present (for distributed tracing correlation)
+                ctx = None
+                if trace_id:
+                    span_context = SpanContext(
+                        trace_id=trace_id,
+                        span_id=span_id if span_id else 0,  # Use extracted span_id or 0 if not available
+                        is_remote=False,
+                        trace_flags=TraceFlags(0x01),
+                    )
+                    span = NonRecordingSpan(span_context)
+                    ctx = trace.set_span_in_context(span)
+
+                otel_logger.emit(
+                    timestamp=time.time_ns(),
+                    observed_timestamp=time.time_ns(),
+                    body=EVALUATION_EVENT_NAME,
+                    attributes=standard_log_attributes,
+                    context=ctx,
+                )
+
+            except Exception as e:
+                LOGGER.warning(f"Failed to log event {i}: {e}")
+
+    except Exception as e:
+        LOGGER.error(f"Failed to log events to App Insights: {e}")
+
+
+def emit_eval_result_events_to_app_insights(
+    app_insights_config: AppInsightsConfig,
+    results: List[Dict],
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+) -> None:
+    """
+    Emit evaluation result events to App Insights using OpenTelemetry logging.
+    Each result is logged as an independent log record, potentially including trace context.
+
+    :param app_insights_config: App Insights configuration containing connection string
+    :type app_insights_config: AppInsightsConfig
+    :param results: List of evaluation results to log
+    :type results: List[Dict]
+    """
+
+    from opentelemetry import _logs
+    from opentelemetry.sdk._logs import LoggerProvider
+    from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
+    from opentelemetry.sdk.resources import Resource
+    from opentelemetry.semconv.resource import ResourceAttributes
+    from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter
+
+    if not results:
+        LOGGER.debug("No results to log to App Insights")
+        return
+
+    try:
+        # Configure OpenTelemetry logging with anonymized Resource attributes
+
+        # Create a resource with minimal attributes to prevent sensitive data collection
+        # SERVICE_INSTANCE_ID maps to cloud_RoleInstance in Azure Monitor and prevents
+        # Azure Monitor from auto-detecting the device hostname
+        anonymized_resource = Resource.create(
+            {
+                ResourceAttributes.SERVICE_NAME: "unknown",
+                ResourceAttributes.SERVICE_INSTANCE_ID: "unknown",
+            }
+        )
+
+        logger_provider = LoggerProvider(resource=anonymized_resource)
+        _logs.set_logger_provider(logger_provider)
+
+        # Create Azure Monitor log exporter
+        azure_log_exporter = AzureMonitorLogExporter(connection_string=app_insights_config["connection_string"])
+
+        # Add the Azure Monitor exporter to the logger provider
+        logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
+
+        # Create a logger from OUR configured logger_provider (not the global one)
+        # This ensures the logger uses our anonymized resource
+        otel_logger = logger_provider.get_logger(__name__)
+
+        # Initialize base log attributes with extra_attributes if present, otherwise empty dict
+        base_log_attributes = app_insights_config.get("extra_attributes", {})
+
+        # Add AppInsights config attributes with proper semantic convention mappings
+        if "run_type" in app_insights_config:
+            base_log_attributes["gen_ai.evaluation.azure_ai_type"] = str(app_insights_config["run_type"])
+        if "schedule_type" in app_insights_config:
+            base_log_attributes["gen_ai.evaluation.azure_ai_scheduled"] = str(app_insights_config["schedule_type"])
+        if "run_id" in app_insights_config:
+            base_log_attributes["gen_ai.evaluation.run.id"] = str(app_insights_config["run_id"])
+        if "project_id" in app_insights_config:
+            base_log_attributes["gen_ai.azure_ai_project.id"] = str(app_insights_config["project_id"])
+
+        for result in results:
+            # Create a copy of base attributes for this result's events
+            log_attributes = base_log_attributes.copy()
+
+            _log_events_to_app_insights(
+                otel_logger=otel_logger,
+                events=result["results"],
+                log_attributes=log_attributes,
+                data_source_item=result["datasource_item"] if "datasource_item" in result else None,
+                evaluator_config=evaluator_config,
+                app_insights_config=app_insights_config,
+            )
+        # Force flush to ensure events are sent
+        logger_provider.force_flush()
+        LOGGER.info(f"Successfully logged {len(results)} evaluation results to App Insights")
+
+    except Exception as e:
+        LOGGER.error(f"Failed to emit evaluation results to App Insights: {e}")
+
+
 def _preprocess_data(
     data: Union[str, os.PathLike],
     evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
@@ -1070,7 +1435,7 @@ def _preprocess_data(
         batch_run_data = input_data_df
     elif client_type == "pf_client":
         batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
-        # Ensure the absolute path is
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
     elif client_type == "code_client":
@@ -1406,3 +1771,652 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
|
|
|
1406
1771
|
category=ErrorCategory.FAILED_EXECUTION,
|
|
1407
1772
|
blame=ErrorBlame.UNKNOWN,
|
|
1408
1773
|
)
|
|
1774
|
+
|
|
1775
|
+
|
|
1776
|
+
def _convert_results_to_aoai_evaluation_results(
|
|
1777
|
+
results: EvaluationResult,
|
|
1778
|
+
logger: logging.Logger,
|
|
1779
|
+
eval_id: Optional[str] = None,
|
|
1780
|
+
eval_run_id: Optional[str] = None,
|
|
1781
|
+
evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] = None,
|
|
1782
|
+
eval_run_summary: Optional[Dict[str, Any]] = None,
|
|
1783
|
+
eval_meta_data: Optional[Dict[str, Any]] = None,
|
|
1784
|
+
) -> None:
|
|
1785
|
+
"""
|
|
1786
|
+
Convert evaluation results to AOAI evaluation results format.
|
|
1787
|
+
|
|
1788
|
+
Each row of input results.rows looks like:
|
|
1789
|
+
{"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe",
|
|
1790
|
+
"inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.",
|
|
1791
|
+
"outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5}
|
|
1792
|
+
|
|
1793
|
+
Convert each row into new RunOutputItem object with results array.
|
|
1794
|
+
|
|
1795
|
+
:param results: The evaluation results to convert
|
|
1796
|
+
:type results: EvaluationResult
|
|
1797
|
+
:param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria
|
|
1798
|
+
:type eval_meta_data: Dict[str, Any]
|
|
1799
|
+
:param logger: Logger instance
|
|
1800
|
+
:type logger: logging.Logger
|
|
1801
|
+
:return: EvaluationResult with converted evaluation results in AOAI format
|
|
1802
|
+
:rtype: EvaluationResult
|
|
1803
|
+
"""
|
|
1804
|
+
|
|
1805
|
+
if evaluators is None:
|
|
1806
|
+
return
|
|
1807
|
+
|
|
1808
|
+
# Get the testing_criteria_name and testing_criteria_type from evaluators
|
|
1809
|
+
testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {}
|
|
1810
|
+
criteria_name_types_from_meta: Optional[Dict[str, str]] = {}
|
|
1811
|
+
if eval_meta_data and "testing_criteria" in eval_meta_data:
|
|
1812
|
+
testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")
|
|
1813
|
+
if testing_criteria_list is not None:
|
|
1814
|
+
for criteria in testing_criteria_list:
|
|
1815
|
+
criteria_name = criteria.get("name")
|
|
1816
|
+
criteria_type = criteria.get("type")
|
|
1817
|
+
if criteria_name is not None and criteria_type is not None:
|
|
1818
|
+
criteria_name_types_from_meta[criteria_name] = criteria
|
|
1819
|
+
|
|
1820
|
+
for criteria_name, evaluator in evaluators.items():
|
|
1821
|
+
criteria_type = None
|
|
1822
|
+
metrics = []
|
|
1823
|
+
if criteria_name in criteria_name_types_from_meta:
|
|
1824
|
+
criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
|
|
1825
|
+
evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
|
|
1826
|
+
current_evaluator_metrics = criteria_name_types_from_meta[criteria_name].get("metrics", None)
|
|
1827
|
+
if current_evaluator_metrics and len(current_evaluator_metrics) > 0:
|
|
1828
|
+
metrics.extend(current_evaluator_metrics)
|
|
1829
|
+
elif evaluator_name:
|
|
1830
|
+
if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
|
|
1831
|
+
evaluator_name = evaluator_name.replace("builtin.", "")
|
|
1832
|
+
metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
|
|
1833
|
+
if metrics_mapped and len(metrics_mapped) > 0:
|
|
1834
|
+
metrics.extend(metrics_mapped)
|
|
1835
|
+
else:
|
|
1836
|
+
metrics.append(criteria_name)
|
|
1837
|
+
elif isinstance(evaluator, AzureOpenAIGrader):
|
|
1838
|
+
criteria_type = evaluator._type # pylint: disable=protected-access
|
|
1839
|
+
metrics.append(criteria_name)
|
|
1840
|
+
elif isinstance(evaluator, EvaluatorBase):
|
|
1841
|
+
criteria_type = "azure_ai_evaluator"
|
|
1842
|
+
evaluator_class_name = evaluator.__class__.__name__
|
|
1843
|
+
eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None)
|
|
1844
|
+
if eval_name:
|
|
1845
|
+
metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
|
|
1846
|
+
if metrics_mapped and len(metrics_mapped) > 0:
|
|
1847
|
+
metrics.extend(metrics_mapped)
|
|
1848
|
+
else:
|
|
1849
|
+
metrics.append(criteria_name)
|
|
1850
|
+
else:
|
|
1851
|
+
criteria_type = "unknown"
|
|
1852
|
+
metrics.append(criteria_name)
|
|
1853
|
+
testing_criteria_name_types_metrics[criteria_name] = {"type": criteria_type, "metrics": metrics}
|
|
1854
|
+
|
|
1855
|
+
created_time = int(time.time())
|
|
1856
|
+
converted_rows = []
|
|
1857
|
+
|
|
1858
|
+
for row_idx, row in enumerate(results.get("rows", [])):
|
|
1859
|
+
# Group outputs by test criteria name
|
|
1860
|
+
criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()}
|
|
1861
|
+
input_groups = {}
|
|
1862
|
+
top_sample = {}
|
|
1863
|
+
for key, value in row.items():
|
|
1864
|
+
if key.startswith("outputs."):
|
|
1865
|
+
# Parse key: outputs.<test-criteria-name>.<metric>
|
|
1866
|
+
parts = key.split(".", 2) # Split into max 3 parts: ['outputs', '<criteria-name>', '<metric>']
|
|
1867
|
+
if len(parts) >= 3:
|
|
1868
|
+
criteria_name = parts[1]
|
|
1869
|
+
metric_name = parts[2]
|
|
1870
|
+
|
|
1871
|
+
if criteria_name not in criteria_groups:
|
|
1872
|
+
criteria_groups[criteria_name] = {}
|
|
1873
|
+
|
|
1874
|
+
criteria_groups[criteria_name][metric_name] = value
|
|
1875
|
+
elif key.startswith("inputs."):
|
|
1876
|
+
input_key = key.replace("inputs.", "")
|
|
1877
|
+
if input_key not in input_groups:
|
|
1878
|
+
input_groups[input_key] = value
|
|
1879
|
+
|
|
1880
|
+
# Convert each criteria group to RunOutputItem result
|
|
1881
|
+
run_output_results = []
|
|
1882
|
+
for criteria_name, metrics in criteria_groups.items():
|
|
1883
|
+
# Extract metrics for this criteria
|
|
1884
|
+
expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
|
|
1885
|
+
criteria_type = testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "unknown")
|
|
1886
|
+
result_per_metric = {}
|
|
1887
|
+
# Find score - look for various score patterns
|
|
1888
|
+
for metric_key, metric_value in metrics.items():
|
|
1889
|
+
if metric_key.endswith("_score") or metric_key == "score":
|
|
1890
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1891
|
+
if metric not in result_per_metric:
|
|
1892
|
+
result_per_metric[metric] = {"score": metric_value}
|
|
1893
|
+
else:
|
|
1894
|
+
result_per_metric[metric]["score"] = metric_value
|
|
1895
|
+
_append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
|
|
1896
|
+
if metric_key == "passed":
|
|
1897
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1898
|
+
if metric not in result_per_metric:
|
|
1899
|
+
result_per_metric[metric] = {"passed": metric_value}
|
|
1900
|
+
else:
|
|
1901
|
+
result_per_metric[metric]["passed"] = metric_value
|
|
1902
|
+
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, metric_value)
|
|
1903
|
+
elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
|
|
1904
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1905
|
+
label = metric_value
|
|
1906
|
+
passed = (
|
|
1907
|
+
True if (str(metric_value).lower() == "pass" or str(metric_value).lower() == "true") else False
|
|
1908
|
+
)
|
|
1909
|
+
if metric not in result_per_metric:
|
|
1910
|
+
if criteria_type == "azure_ai_evaluator":
|
|
1911
|
+
result_per_metric[metric] = {"label": label, "passed": passed}
|
|
1912
|
+
else:
|
|
1913
|
+
result_per_metric[metric] = {"label": label}
|
|
1914
|
+
else:
|
|
1915
|
+
result_per_metric[metric]["label"] = metric_value
|
|
1916
|
+
if criteria_type == "azure_ai_evaluator":
|
|
1917
|
+
result_per_metric[metric]["passed"] = passed
|
|
1918
|
+
_append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
|
|
1919
|
+
if criteria_type == "azure_ai_evaluator":
|
|
1920
|
+
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
|
|
1921
|
+
elif (
|
|
1922
|
+
metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
|
|
1923
|
+
) or metric_key == "reason":
|
|
1924
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1925
|
+
if metric not in result_per_metric:
|
|
1926
|
+
result_per_metric[metric] = {"reason": metric_value}
|
|
1927
|
+
else:
|
|
1928
|
+
result_per_metric[metric]["reason"] = metric_value
|
|
1929
|
+
_append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
|
|
1930
|
+
elif metric_key.endswith("_threshold") or metric_key == "threshold":
|
|
1931
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1932
|
+
if metric not in result_per_metric:
|
|
1933
|
+
result_per_metric[metric] = {"threshold": metric_value}
|
|
1934
|
+
else:
|
|
1935
|
+
result_per_metric[metric]["threshold"] = metric_value
|
|
1936
|
+
_append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
|
|
1937
|
+
elif metric_key == "sample":
|
|
1938
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1939
|
+
if metric not in result_per_metric:
|
|
1940
|
+
result_per_metric[metric] = {"sample": metric_value}
|
|
1941
|
+
else:
|
|
1942
|
+
result_per_metric[metric]["sample"] = metric_value
|
|
1943
|
+
_append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
|
|
1944
|
+
elif metric_key.endswith("_finish_reason"):
|
|
1945
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1946
|
+
if metric not in result_per_metric:
|
|
1947
|
+
result_per_metric[metric] = {"sample": {"finish_reason": metric_value}}
|
|
1948
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
1949
|
+
result_per_metric[metric]["sample"] = {"finish_reason": metric_value}
|
|
1950
|
+
elif (
|
|
1951
|
+
metric in result_per_metric
|
|
1952
|
+
and "sample" in result_per_metric[metric]
|
|
1953
|
+
and "finish_reason" not in result_per_metric[metric]["sample"]
|
|
1954
|
+
):
|
|
1955
|
+
result_per_metric[metric]["sample"]["finish_reason"] = metric_value
|
|
1956
|
+
_append_indirect_attachments_to_results(
|
|
1957
|
+
result_per_metric, "sample", metric, metric_value, "finish_reason"
|
|
1958
|
+
)
|
|
1959
|
+
elif metric_key.endswith("_model"):
|
|
1960
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1961
|
+
if metric not in result_per_metric:
|
|
1962
|
+
result_per_metric[metric] = {"sample": {"model": metric_value}}
|
|
1963
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
1964
|
+
result_per_metric[metric]["sample"] = {"model": metric_value}
|
|
1965
|
+
elif (
|
|
1966
|
+
metric in result_per_metric
|
|
1967
|
+
and "sample" in result_per_metric[metric]
|
|
1968
|
+
and "model" not in result_per_metric[metric]["sample"]
|
|
1969
|
+
):
|
|
1970
|
+
result_per_metric[metric]["sample"]["model"] = metric_value
|
|
1971
|
+
_append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
|
|
1972
|
+
elif metric_key.endswith("_sample_input"):
|
|
1973
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1974
|
+
input_metric_val_json: Optional[List[Dict[str, Any]]] = []
|
|
1975
|
+
try:
|
|
1976
|
+
input_metric_val_json = json.loads(metric_value)
|
|
1977
|
+
except Exception as e:
|
|
1978
|
+
logger.warning(f"Failed to parse _sample_input value as JSON: {e}")
|
|
1979
|
+
if metric not in result_per_metric:
|
|
1980
|
+
result_per_metric[metric] = {"sample": {"input": input_metric_val_json}}
|
|
1981
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
1982
|
+
result_per_metric[metric]["sample"] = {"input": input_metric_val_json}
|
|
1983
|
+
elif (
|
|
1984
|
+
metric in result_per_metric
|
|
1985
|
+
and "sample" in result_per_metric[metric]
|
|
1986
|
+
and "input" not in result_per_metric[metric]["sample"]
|
|
1987
|
+
):
|
|
1988
|
+
result_per_metric[metric]["sample"]["input"] = input_metric_val_json
|
|
1989
|
+
_append_indirect_attachments_to_results(
|
|
1990
|
+
result_per_metric, "sample", metric, input_metric_val_json, "input"
|
|
1991
|
+
)
|
|
1992
|
+
elif metric_key.endswith("_sample_output"):
|
|
1993
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
1994
|
+
output_metric_val_json: Optional[List[Dict[str, Any]]] = []
|
|
1995
|
+
try:
|
|
1996
|
+
output_metric_val_json = json.loads(metric_value)
|
|
1997
|
+
except Exception as e:
|
|
1998
|
+
logger.warning(f"Failed to parse _sample_output value as JSON: {e}")
|
|
1999
|
+
if metric not in result_per_metric:
|
|
2000
|
+
result_per_metric[metric] = {"sample": {"output": output_metric_val_json}}
|
|
2001
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
2002
|
+
result_per_metric[metric]["sample"] = {"output": output_metric_val_json}
|
|
2003
|
+
elif (
|
|
2004
|
+
metric in result_per_metric
|
|
2005
|
+
and "sample" in result_per_metric[metric]
|
|
2006
|
+
and "output" not in result_per_metric[metric]["sample"]
|
|
2007
|
+
):
|
|
2008
|
+
result_per_metric[metric]["sample"]["output"] = output_metric_val_json
|
|
2009
|
+
_append_indirect_attachments_to_results(
|
|
2010
|
+
result_per_metric, "sample", metric, output_metric_val_json, "output"
|
|
2011
|
+
)
|
|
2012
|
+
elif metric_key.endswith("_total_tokens"):
|
|
2013
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
2014
|
+
if metric not in result_per_metric:
|
|
2015
|
+
result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
|
|
2016
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
2017
|
+
result_per_metric[metric]["sample"] = {"usage": {"total_tokens": metric_value}}
|
|
2018
|
+
elif (
|
|
2019
|
+
metric in result_per_metric
|
|
2020
|
+
and "sample" in result_per_metric[metric]
|
|
2021
|
+
and "usage" not in result_per_metric[metric]["sample"]
|
|
2022
|
+
):
|
|
2023
|
+
result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
|
|
2024
|
+
else:
|
|
2025
|
+
result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
|
|
2026
|
+
_append_indirect_attachments_to_results(
|
|
2027
|
+
result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
|
|
2028
|
+
)
|
|
2029
|
+
elif metric_key.endswith("_prompt_tokens"):
|
|
2030
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
2031
|
+
if metric not in result_per_metric:
|
|
2032
|
+
result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
|
|
2033
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
2034
|
+
result_per_metric[metric]["sample"] = {"usage": {"prompt_tokens": metric_value}}
|
|
2035
|
+
elif (
|
|
2036
|
+
metric in result_per_metric
|
|
2037
|
+
and "sample" in result_per_metric[metric]
|
|
2038
|
+
and "usage" not in result_per_metric[metric]["sample"]
|
|
2039
|
+
):
|
|
2040
|
+
result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
|
|
2041
|
+
else:
|
|
2042
|
+
result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
|
|
2043
|
+
_append_indirect_attachments_to_results(
|
|
2044
|
+
result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
|
|
2045
|
+
)
|
|
2046
|
+
elif metric_key.endswith("_completion_tokens"):
|
|
2047
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
2048
|
+
if metric not in result_per_metric:
|
|
2049
|
+
result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
|
|
2050
|
+
elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
|
|
2051
|
+
result_per_metric[metric]["sample"] = {"usage": {"completion_tokens": metric_value}}
|
|
2052
|
+
elif (
|
|
2053
|
+
metric in result_per_metric
|
|
2054
|
+
and "sample" in result_per_metric[metric]
|
|
2055
|
+
and "usage" not in result_per_metric[metric]["sample"]
|
|
2056
|
+
):
|
|
2057
|
+
result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
|
|
2058
|
+
else:
|
|
2059
|
+
result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
|
|
2060
|
+
_append_indirect_attachments_to_results(
|
|
2061
|
+
result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
|
|
2062
|
+
)
|
|
2063
|
+
elif not any(
|
|
2064
|
+
metric_key.endswith(suffix)
|
|
2065
|
+
for suffix in [
|
|
2066
|
+
"_result",
|
|
2067
|
+
"_reason",
|
|
2068
|
+
"_threshold",
|
|
2069
|
+
"_label",
|
|
2070
|
+
"_score",
|
|
2071
|
+
"_model",
|
|
2072
|
+
"_finish_reason",
|
|
2073
|
+
"_sample_input",
|
|
2074
|
+
"_sample_output",
|
|
2075
|
+
"_total_tokens",
|
|
2076
|
+
"_prompt_tokens",
|
|
2077
|
+
"_completion_tokens",
|
|
2078
|
+
]
|
|
2079
|
+
):
|
|
2080
|
+
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
|
|
2081
|
+
# If no score found yet and this doesn't match other patterns, use as score
|
|
2082
|
+
if metric_key == metric and metric not in result_per_metric:
|
|
2083
|
+
result_per_metric[metric] = {"score": metric_value}
|
|
2084
|
+
elif metric_key == metric and result_per_metric[metric].get("score", None) is None:
|
|
2085
|
+
result_per_metric[metric]["score"] = metric_value
|
|
2086
|
+
|
|
2087
|
+
for metric, metric_values in result_per_metric.items():
|
|
2088
|
+
score = metric_values.get("score", None)
|
|
2089
|
+
label = metric_values.get("label", None)
|
|
2090
|
+
reason = metric_values.get("reason", None)
|
|
2091
|
+
threshold = metric_values.get("threshold", None)
|
|
2092
|
+
passed = metric_values.get("passed", None)
|
|
2093
|
+
sample = metric_values.get("sample", None)
|
|
2094
|
+
|
|
2095
|
+
# Create result object for this criteria
|
|
2096
|
+
result_obj = {
|
|
2097
|
+
"type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
|
|
2098
|
+
"type", "azure_ai_evaluator"
|
|
2099
|
+
),
|
|
2100
|
+
"name": criteria_name, # Use criteria name as name
|
|
2101
|
+
"metric": metric if metric is not None else criteria_name, # Use criteria name as metric
|
|
2102
|
+
}
|
|
2103
|
+
# Add optional fields
|
|
2104
|
+
if (
|
|
2105
|
+
metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
|
|
2106
|
+
or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
|
|
2107
|
+
or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
|
|
2108
|
+
):
|
|
2109
|
+
copy_label = label
|
|
2110
|
+
if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
|
|
2111
|
+
label = "fail"
|
|
2112
|
+
score = 0.0
|
|
2113
|
+
passed = False
|
|
2114
|
+
else:
|
|
2115
|
+
label = "pass"
|
|
2116
|
+
score = 1.0
|
|
2117
|
+
passed = True
|
|
2118
|
+
result_obj["score"] = (
|
|
2119
|
+
score if not (score is None or (isinstance(score, float) and math.isnan(score))) else None
|
|
2120
|
+
)
|
|
2121
|
+
result_obj["label"] = label
|
|
2122
|
+
result_obj["reason"] = reason
|
|
2123
|
+
result_obj["threshold"] = threshold
|
|
2124
|
+
result_obj["passed"] = passed
|
|
2125
|
+
|
|
2126
|
+
if sample is not None:
|
|
2127
|
+
result_obj["sample"] = sample
|
|
2128
|
+
top_sample = sample # Save top sample for the row
|
|
2129
|
+
run_output_results.append(result_obj)
|
|
2130
|
+
|
|
2131
|
+
if (
|
|
2132
|
+
eval_run_summary
|
|
2133
|
+
and criteria_name in eval_run_summary
|
|
2134
|
+
and isinstance(eval_run_summary[criteria_name], dict)
|
|
2135
|
+
and "error_code" in eval_run_summary[criteria_name]
|
|
2136
|
+
) and eval_run_summary[criteria_name].get("error_code", None) is not None:
|
|
2137
|
+
error_info = (
|
|
2138
|
+
{
|
|
2139
|
+
"code": eval_run_summary[criteria_name].get("error_code", None),
|
|
2140
|
+
"message": eval_run_summary[criteria_name].get("error_message", None),
|
|
2141
|
+
}
|
|
2142
|
+
if eval_run_summary[criteria_name].get("error_code", None) is not None
|
|
2143
|
+
else None
|
|
2144
|
+
)
|
|
2145
|
+
sample = {"error": error_info} if error_info is not None else None
|
|
2146
|
+
# Create result object for this criteria
|
|
2147
|
+
metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
|
|
2148
|
+
for metric in metrics:
|
|
2149
|
+
result_obj = {
|
|
2150
|
+
"type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
|
|
2151
|
+
"type", "azure_ai_evaluator"
|
|
2152
|
+
),
|
|
2153
|
+
"name": criteria_name, # Use criteria name as name
|
|
2154
|
+
"metric": metric if metric is not None else criteria_name, # Use criteria name as metric
|
|
2155
|
+
"score": None,
|
|
2156
|
+
"label": None,
|
|
2157
|
+
"reason": None,
|
|
2158
|
+
"threshold": None,
|
|
2159
|
+
"passed": None,
|
|
2160
|
+
"sample": sample,
|
|
2161
|
+
}
|
|
2162
|
+
run_output_results.append(result_obj)
|
|
2163
|
+
|
|
2164
|
+
# Create RunOutputItem structure
|
|
2165
|
+
run_output_item = {
|
|
2166
|
+
"object": "eval.run.output_item",
|
|
2167
|
+
"id": f"{row_idx+1}",
|
|
2168
|
+
"run_id": eval_run_id,
|
|
2169
|
+
"eval_id": eval_id,
|
|
2170
|
+
"created_at": created_time,
|
|
2171
|
+
"datasource_item_id": row_idx,
|
|
2172
|
+
"datasource_item": input_groups,
|
|
2173
|
+
"results": run_output_results,
|
|
2174
|
+
"status": "completed" if len(run_output_results) > 0 else "error",
|
|
2175
|
+
}
|
|
2176
|
+
|
|
2177
|
+
run_output_item["sample"] = top_sample
|
|
2178
|
+
|
|
2179
|
+
converted_rows.append(run_output_item)
|
|
2180
|
+
|
|
2181
|
+
# Create converted results maintaining the same structure
|
|
2182
|
+
results["_evaluation_results_list"] = converted_rows
|
|
2183
|
+
logger.info(
|
|
2184
|
+
f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
|
|
2185
|
+
)
|
|
2186
|
+
# Calculate summary statistics
|
|
2187
|
+
evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger)
|
|
2188
|
+
results["_evaluation_summary"] = evaluation_summary
|
|
2189
|
+
logger.info(
|
|
2190
|
+
f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
|
|
2191
|
+
)
|
|
2192
|
+
|
|
2193
|
+
|
|
+def _append_indirect_attachments_to_results(
+    current_result_dict: Dict[str, Any],
+    result_name: str,
+    metric: str,
+    metric_value: Any,
+    nested_result_name: Optional[str] = None,
+    secondnested_result_name: Optional[str] = None,
+) -> None:
+    """
+    Append indirect attachments to the current result dictionary.
+
+    :param current_result_dict: The current result dictionary to update
+    :type current_result_dict: Dict[str, Any]
+    :param result_name: The result name
+    :type result_name: str
+    :param metric: The metric name
+    :type metric: str
+    :param metric_value: The value of the metric
+    :type metric_value: Any
+    """
+    if metric == "xpia" and result_name:
+        for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
+            if nested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: metric_value}
+                else:
+                    current_result_dict[metric_extended][result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is not None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {
+                        result_name: {nested_result_name: {secondnested_result_name: metric_value}}
+                    }
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {
+                        nested_result_name: {secondnested_result_name: metric_value}
+                    }
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = {
+                        secondnested_result_name: metric_value
+                    }
+                else:
+                    (
+                        current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
+                    ) = metric_value
+
+
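A small usage sketch may help here, since the three branches differ only in how deeply the value is nested. The call below is illustrative; the result name and score are invented and not part of the package.

from typing import Any, Dict

# Illustrative only; "indirect_attack" and 0.5 are made-up example values.
current: Dict[str, Any] = {}
_append_indirect_attachments_to_results(current, "indirect_attack", "xpia", 0.5)
# current now holds the same value under each extended key:
# {
#     "xpia_manipulated_content": {"indirect_attack": 0.5},
#     "xpia_intrusion": {"indirect_attack": 0.5},
#     "xpia_information_gathering": {"indirect_attack": 0.5},
# }
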
+def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+    """
+    Get the metric name from the testing criteria and metric key.
+
+    :param testing_criteria_name: The name of the testing criteria
+    :type testing_criteria_name: str
+    :param metric_key: The metric key to look for
+    :type metric_key: str
+    :param metric_list: List of expected metrics for the testing criteria
+    :type metric_list: List[str]
+    :return: The metric name if found, otherwise the testing criteria name
+    :rtype: str
+    """
+    metric = None
+
+    if metric_key == "xpia_manipulated_content":
+        metric = "xpia_manipulated_content"
+        return metric
+    elif metric_key == "xpia_intrusion":
+        metric = "xpia_intrusion"
+        return metric
+    elif metric_key == "xpia_information_gathering":
+        metric = "xpia_information_gathering"
+        return metric
+    for expected_metric in metric_list:
+        if metric_key.startswith(expected_metric):
+            metric = expected_metric
+            break
+    if metric is None:
+        metric = testing_criteria_name
+    return metric
+
+
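The lookup order above (special-cased xpia keys first, then a prefix match against the expected metric list, then the testing criteria name as a fallback) is easiest to see with a few invented inputs; these calls are illustrative only, not taken from the package tests.

# Invented inputs to show the three lookup paths:
_get_metric_from_criteria("my_criteria", "xpia_intrusion", [])             # -> "xpia_intrusion"
_get_metric_from_criteria("my_criteria", "violence_score", ["violence"])   # -> "violence" (prefix match)
_get_metric_from_criteria("my_criteria", "some_other_key", ["violence"])   # -> "my_criteria" (fallback)
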
+def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger) -> Dict[str, Any]:
+    """
+    Calculate summary statistics for AOAI evaluation results.
+
+    :param aoai_results: List of AOAI result objects (run_output_items)
+    :type aoai_results: list
+    :return: Summary statistics dictionary
+    :rtype: Dict[str, Any]
+    """
+    # Calculate result counts based on aoaiResults
+    result_counts = {"total": 0, "errored": 0, "failed": 0, "passed": 0}
+
+    # Count results by status and calculate per model usage
+    model_usage_stats = {}  # Dictionary to aggregate usage by model
+    result_counts_stats = {}  # Dictionary to aggregate pass/fail counts by testing criteria
+
+    for aoai_result in aoai_results:
+        logger.info(
+            f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}"
+        )
+        result_counts["total"] += 1
+        passed_count = 0
+        failed_count = 0
+        error_count = 0
+        if isinstance(aoai_result, dict) and "results" in aoai_result:
+            logger.info(
+                f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}"
+            )
+            for result_item in aoai_result["results"]:
+                if isinstance(result_item, dict):
+                    # Check if the result has a 'passed' field
+                    if "passed" in result_item and result_item["passed"] is not None:
+                        testing_criteria = result_item.get("name", "")
+                        if testing_criteria not in result_counts_stats:
+                            result_counts_stats[testing_criteria] = {
+                                "testing_criteria": testing_criteria,
+                                "failed": 0,
+                                "passed": 0,
+                            }
+                        if result_item["passed"] is True:
+                            passed_count += 1
+                            result_counts_stats[testing_criteria]["passed"] += 1
+
+                        elif result_item["passed"] is False:
+                            failed_count += 1
+                            result_counts_stats[testing_criteria]["failed"] += 1
+                    # Check if the result indicates an error status
+                    elif ("status" in result_item and result_item["status"] in ["error", "errored"]) or (
+                        "sample" in result_item
+                        and isinstance(result_item["sample"], dict)
+                        and result_item["sample"].get("error", None) is not None
+                    ):
+                        error_count += 1
+        elif hasattr(aoai_result, "status") and aoai_result.status == "error":
+            error_count += 1
+        elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
+            error_count += 1
+
+        if error_count > 0:
+            result_counts["errored"] += 1
+        elif failed_count > 0:
+            result_counts["failed"] += 1
+        elif (
+            error_count == 0
+            and failed_count == 0
+            and passed_count > 0
+            and passed_count == len(aoai_result.get("results", []))
+        ):
+            result_counts["passed"] += 1
+
+        # Extract usage statistics from aoai_result.sample
+        sample_data_list = []
+        dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
+        dup_usage_list.remove("xpia")
+        if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
+            for result_item in aoai_result["results"]:
+                if (
+                    isinstance(result_item, dict)
+                    and "sample" in result_item
+                    and result_item["sample"]
+                    and result_item["metric"] not in dup_usage_list
+                ):
+                    sample_data_list.append(result_item["sample"])
+
+        for sample_data in sample_data_list:
+            if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
+                usage_data = sample_data["usage"]
+                model_name = sample_data.get("model", "unknown")
+                if model_name not in model_usage_stats:
+                    model_usage_stats[model_name] = {
+                        "invocation_count": 0,
+                        "total_tokens": 0,
+                        "prompt_tokens": 0,
+                        "completion_tokens": 0,
+                        "cached_tokens": 0,
+                    }
+                # Aggregate usage statistics
+                model_stats = model_usage_stats[model_name]
+                model_stats["invocation_count"] += 1
+                if isinstance(usage_data, dict):
+                    model_stats["total_tokens"] += usage_data.get("total_tokens", 0)
+                    model_stats["prompt_tokens"] += usage_data.get("prompt_tokens", 0)
+                    model_stats["completion_tokens"] += usage_data.get("completion_tokens", 0)
+                    model_stats["cached_tokens"] += usage_data.get("cached_tokens", 0)
+
+    # Convert model usage stats to list format matching EvaluationRunPerModelUsage
+    per_model_usage = []
+    for model_name, stats in model_usage_stats.items():
+        per_model_usage.append(
+            {
+                "model_name": model_name,
+                "invocation_count": stats["invocation_count"],
+                "total_tokens": stats["total_tokens"],
+                "prompt_tokens": stats["prompt_tokens"],
+                "completion_tokens": stats["completion_tokens"],
+                "cached_tokens": stats["cached_tokens"],
+            }
+        )
+    result_counts_stats_val = []
+    logger.info(f"\r\n Result counts stats: {result_counts_stats}")
+    for criteria_name, stats_val in result_counts_stats.items():
+        if isinstance(stats_val, dict):
+            logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}")
+            result_counts_stats_val.append(
+                {
+                    "testing_criteria": criteria_name,
+                    "passed": stats_val.get("passed", 0),
+                    "failed": stats_val.get("failed", 0),
+                }
+            )
+    return {
+        "result_counts": result_counts,
+        "per_model_usage": per_model_usage,
+        "per_testing_criteria_results": result_counts_stats_val,
+    }
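
To make the returned structure concrete, here is a minimal, hedged example of calling the summary helper. The row and result-item field names are inferred from the function body above rather than from a documented schema, and the expected outputs are shown as comments.

import logging

# Hypothetical rows; field names ("results", "name", "metric", "passed", "sample",
# "usage", "model") are inferred from the code above.
_sample = {
    "model": "gpt-4o",
    "usage": {"total_tokens": 120, "prompt_tokens": 100, "completion_tokens": 20, "cached_tokens": 0},
}
_rows = [
    {"results": [{"name": "relevance", "metric": "relevance", "passed": True, "sample": _sample}]},
    {"results": [{"name": "relevance", "metric": "relevance", "passed": False, "sample": _sample}]},
]
summary = _calculate_aoai_evaluation_summary(_rows, logging.getLogger(__name__))
# Expected, under the assumptions above:
#   summary["result_counts"] == {"total": 2, "errored": 0, "failed": 1, "passed": 1}
#   summary["per_model_usage"][0]["invocation_count"] == 2 and ["total_tokens"] == 240
#   summary["per_testing_criteria_results"] == [{"testing_criteria": "relevance", "passed": 1, "failed": 1}]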