azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
- azure/ai/evaluation/_evaluate/_utils.py +5 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +4 -3
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +655 -2665
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_converters/_models.py

@@ -3,17 +3,31 @@ import json

 from pydantic import BaseModel

-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Union

 # Models moved in a later version of agents SDK, so try a few different locations
-
-
-
-
-
-
-
-
+# Only import for type checking to avoid runtime import errors
+if TYPE_CHECKING:
+    try:
+        from azure.ai.projects.models import RunStepFunctionToolCall
+    except ImportError:
+        try:
+            from azure.ai.agents.models import RunStepFunctionToolCall
+        except ImportError:
+            # Create a protocol for type checking when the real class isn't available
+            from typing import Protocol
+
+            class RunStepFunctionToolCall(Protocol):
+                """Protocol defining the expected interface for RunStepFunctionToolCall."""
+
+                id: str
+                type: str
+
+                def get(self, key: str, default: Any = None) -> Any: ...
+
+else:
+    # At runtime, we don't need the actual class since it's only used in type annotations
+    RunStepFunctionToolCall = Any

 # Message roles constants.
 _SYSTEM = "system"
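The new import block above uses a pattern worth calling out: `RunStepFunctionToolCall` is imported only under `typing.TYPE_CHECKING`, falls back to a `Protocol` stand-in when neither SDK provides it, and is aliased to `Any` at runtime. A minimal, self-contained sketch of the same technique (the `some_optional_sdk` module and `WidgetResult` class below are illustrative, not the SDK's):

```python
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Only static type checkers evaluate this branch; the import never runs,
    # so a missing optional dependency cannot raise ImportError for end users.
    try:
        from some_optional_sdk import WidgetResult  # hypothetical optional dependency
    except ImportError:
        from typing import Protocol

        class WidgetResult(Protocol):
            """Structural stand-in describing only the attributes the code relies on."""

            id: str

            def get(self, key: str, default: Any = None) -> Any: ...

else:
    # At runtime the name only ever appears in annotations, so Any suffices.
    WidgetResult = Any


def describe(result: "WidgetResult") -> str:
    # The quoted annotation stays valid whether WidgetResult is a real class,
    # a Protocol, or the runtime Any alias.
    return f"widget {result.get('id', '?')}"


print(describe({"id": "w-1"}))  # a dict satisfies the structural interface
```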
@@ -33,9 +47,12 @@ _TOOL_CALLS = "tool_calls"
 # Constants to only be used internally in this file for the built-in tools.
 _CODE_INTERPRETER = "code_interpreter"
 _BING_GROUNDING = "bing_grounding"
+_BING_CUSTOM_SEARCH = "bing_custom_search"
 _FILE_SEARCH = "file_search"
 _AZURE_AI_SEARCH = "azure_ai_search"
+_SHAREPOINT_GROUNDING = "sharepoint_grounding"
 _FABRIC_DATAAGENT = "fabric_dataagent"
+_OPENAPI = "openapi"

 # Built-in tool descriptions and parameters are hidden, but we include basic descriptions
 # for evaluation purposes.
@@ -44,8 +61,10 @@ _BUILT_IN_DESCRIPTIONS = {
     + "generate code, and create graphs and charts using your data. Supports "
     + "up to 20 files.",
     _BING_GROUNDING: "Enhance model output with web data.",
-
+    _BING_CUSTOM_SEARCH: "Enables agents to retrieve content from a curated subset of websites, enhancing relevance and reducing noise from public web searches.",
+    _FILE_SEARCH: "Search for data across uploaded files. A single call can return multiple results/files in the 'results' field.",
     _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _SHAREPOINT_GROUNDING: "Allows agents to access and retrieve relevant content from Microsoft SharePoint document libraries, grounding responses in organizational knowledge.",
     _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
 }

@@ -59,6 +78,15 @@ _BUILT_IN_PARAMS = {
         "type": "object",
         "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
     },
+    _BING_CUSTOM_SEARCH: {
+        "type": "object",
+        "properties": {
+            "requesturl": {
+                "type": "string",
+                "description": "Search queries, along with pre-configured site restrictions or domain filters.",
+            }
+        },
+    },
     _FILE_SEARCH: {
         "type": "object",
         "properties": {
@@ -76,6 +104,12 @@ _BUILT_IN_PARAMS = {
         "type": "object",
         "properties": {"input": {"type": "string", "description": "Search terms to use."}},
     },
+    _SHAREPOINT_GROUNDING: {
+        "type": "object",
+        "properties": {
+            "input": {"type": "string", "description": "A natural language query to search SharePoint content."}
+        },
+    },
     _FABRIC_DATAAGENT: {
         "type": "object",
         "properties": {"input": {"type": "string", "description": "Search terms to use."}},
@@ -217,6 +251,27 @@ class ToolDefinition(BaseModel):
     parameters: dict


+class OpenAPIToolDefinition(BaseModel):
+    """Represents OpenAPI tool definition that will be used in the agent.
+    :param name: The name of the tool.
+    :type name: str
+    :param type: The type of the tool.
+    :type type: str
+    :param description: A description of the tool.
+    :type description: str
+    :param parameters: The parameters required by the tool.
+    :type parameters: dict
+    """
+
+    name: str
+    type: str
+    description: Optional[str] = None
+    spec: object
+    auth: object
+    default_params: Optional[list[str]] = None
+    functions: list[ToolDefinition]
+
+
 class ToolCall:
     """Represents a tool call, used as an intermediate step in the conversion process.

@@ -247,7 +302,7 @@ class EvaluatorData(BaseModel):

     query: List[Message]
     response: List[Message]
-    tool_definitions: List[ToolDefinition]
+    tool_definitions: List[Union[ToolDefinition, OpenAPIToolDefinition]]

     def to_json(self):
         """Converts the result to a JSON string.
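With `tool_definitions` widened above, callers can mix plain function tools with OpenAPI tool definitions. A hypothetical construction, assuming the field set shown in this diff and importing from the private `azure.ai.evaluation._converters._models` module (a non-public path that may change):

```python
from azure.ai.evaluation._converters._models import (  # private module; path may change
    OpenAPIToolDefinition,
    ToolDefinition,
)

# A plain function tool, using the ToolDefinition fields shown in this diff.
fetch_weather = ToolDefinition(
    name="fetch_weather",
    type="function",
    description="Look up current weather for a city.",
    parameters={"type": "object", "properties": {"city": {"type": "string"}}},
)

# An OpenAPI tool wrapping its callable operations. The spec/auth values are
# placeholders; a real spec would be a full OpenAPI document.
countries_api = OpenAPIToolDefinition(
    name="countries_api",
    type="openapi",
    description="REST lookup of country metadata.",
    spec={"openapi": "3.0.3", "info": {"title": "Countries", "version": "1.0"}},
    auth={"type": "anonymous"},
    functions=[
        ToolDefinition(
            name="get_country",
            type="function",
            description="Fetch one country record.",
            parameters={"type": "object", "properties": {"code": {"type": "string"}}},
        )
    ],
)

# Both models are now valid entries for EvaluatorData.tool_definitions.
tool_definitions = [fetch_weather, countries_api]
```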
@@ -277,14 +332,16 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # all in most of the cases, and bing would only show the API URL, without arguments or results.
     # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
     # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
-    if hasattr(tool_call.details, _FUNCTION):
+    if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
         # This is the internals of the content object that will be included with the tool call.
         tool_call_id = tool_call.details.id
         content_tool_call = {
             "type": _TOOL_CALL,
             "tool_call_id": tool_call_id,
-            "name": tool_call.details.
-            "arguments": safe_loads(
+            "name": tool_call.details.get(_FUNCTION).get("name") if tool_call.details.get(_FUNCTION) else None,
+            "arguments": safe_loads(
+                tool_call.details.get(_FUNCTION).get("arguments") if tool_call.details.get(_FUNCTION) else None
+            ),
         }
     else:
         # Treat built-in tools separately. Object models may be unique so handle each case separately
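The widened `if` condition above accepts `tool_call.details` either as an SDK object exposing a `.function` attribute or as a plain mapping with a `"function"` key. A small sketch of why a single check is not enough (a slightly more defensive variant of the diff's test; the payloads below are invented):

```python
from types import SimpleNamespace


def has_function(details) -> bool:
    """Mirror the diff's check: attribute-style SDK objects OR dict-style payloads."""
    return hasattr(details, "function") or (
        hasattr(details, "get") and details.get("function") is not None
    )


# Dict-style payload, e.g. deserialized JSON: hasattr(..., "function") is False,
# but .get("function") finds the key.
dict_details = {"function": {"name": "fetch_weather", "arguments": "{}"}}

# Attribute-style payload, e.g. an SDK model object: .get() may not exist,
# but the attribute does (short-circuiting keeps this safe).
obj_details = SimpleNamespace(function={"name": "fetch_weather", "arguments": "{}"})

assert has_function(dict_details)
assert has_function(obj_details)
```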
@@ -322,27 +379,19 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # assistant's action of calling the tool.
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))

-    if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function["output"])
+    if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
+        output = safe_loads(tool_call.details.get("function")["output"])
     else:
         try:
             # Some built-ins may have output, others may not
             # Try to retrieve it, but if we don't find anything, skip adding the message
             # Just manually converting to dicts for easy serialization for now rather than custom serializers
             if tool_call.details.type == _CODE_INTERPRETER:
-                output = tool_call.details.code_interpreter.outputs
+                output = [result.as_dict() for result in tool_call.details.code_interpreter.outputs]
             elif tool_call.details.type == _BING_GROUNDING:
                 return messages  # not supported yet from bing grounding tool
             elif tool_call.details.type == _FILE_SEARCH:
-                output = [
-                    {
-                        "file_id": result.file_id,
-                        "file_name": result.file_name,
-                        "score": result.score,
-                        "content": result.content,
-                    }
-                    for result in tool_call.details.file_search.results
-                ]
+                output = [result.as_dict() for result in tool_call.details.file_search.results]
             elif tool_call.details.type == _AZURE_AI_SEARCH:
                 output = tool_call.details.azure_ai_search["output"]
             elif tool_call.details.type == _FABRIC_DATAAGENT:
azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py

@@ -6,6 +6,7 @@ import asyncio
 import logging
 import pandas as pd
 import sys
+import itertools
 from collections import defaultdict
 from concurrent.futures import Future
 from os import PathLike
@@ -16,15 +17,34 @@ from ..._legacy._batch_engine._run_submitter import RunSubmitter
 from ..._legacy._batch_engine._config import BatchEngineConfig
 from ..._legacy._batch_engine._run import Run
 from ..._legacy._adapters._constants import LINE_NUMBER
+from ..._legacy._adapters.types import AttrDict
 from ..._legacy._common._thread_pool_executor_with_context import ThreadPoolExecutorWithContext
+from ..._evaluate._utils import _has_aggregator
+from ..._constants import Prefixes, PF_BATCH_TIMEOUT_SEC

+from .._utils import get_int_env_var as get_int

-
+
+LOGGER = logging.getLogger("run")
+MISSING_VALUE: Final[int] = sys.maxsize


 class RunSubmitterClient:
-    def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
-
+    def __init__(self, *, raise_on_errors: bool = False, config: Optional[BatchEngineConfig] = None) -> None:
+        if config:
+            self._config = config
+        else:
+            # Generate default config and apply any overrides to the configuration from environment variables
+            self._config = BatchEngineConfig(LOGGER, use_async=True)
+            if (val := get_int(PF_BATCH_TIMEOUT_SEC, MISSING_VALUE)) != MISSING_VALUE:
+                self._config.batch_timeout_seconds = val
+            if (val := get_int("PF_LINE_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
+                self._config.line_timeout_seconds = val
+            if (val := get_int("PF_WORKER_COUNT", MISSING_VALUE)) != MISSING_VALUE:
+                self._config.max_concurrency = val
+
+        self._config.raise_on_error = raise_on_errors
+
         self._thread_pool = ThreadPoolExecutorWithContext(
             thread_name_prefix="evaluators_thread", max_workers=self._config.max_concurrency
         )
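The environment overrides in the new `__init__` use `sys.maxsize` as a sentinel so that "variable unset" is distinguishable from any real value, combined with the walrus operator. A standalone sketch of the pattern (the `Config` class and `get_int_env` helper here are illustrative):

```python
import os
import sys
from typing import Final

MISSING: Final[int] = sys.maxsize  # sentinel: no plausible timeout equals this


def get_int_env(name: str, default: int) -> int:
    """Read an integer environment variable, falling back on absence or junk."""
    try:
        return int(os.environ[name])
    except (KeyError, ValueError):
        return default


class Config:
    batch_timeout_seconds = 3600


config = Config()

# Only override the default when the variable is actually set to a valid int.
if (val := get_int_env("PF_BATCH_TIMEOUT_SEC", MISSING)) != MISSING:
    config.batch_timeout_seconds = val
```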
@@ -44,7 +64,6 @@ class RunSubmitterClient:
         # input. Update the inputs so that each entry is a dictionary with a data key
         # that contains the original input data.
         inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]
-
         # Pass the correct previous run to the evaluator
         run: Optional[BatchClientRun] = kwargs.pop("run", None)
         if run:
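The `inputs` wrapping kept as context in this hunk converts each DataFrame row into a `{"data": <row dict>}` envelope so downstream code receives a uniform shape. For instance:

```python
import pandas as pd

data = pd.DataFrame({"query": ["What is 2+2?", "Capital of France?"]})

# to_dict(orient="records") yields one dict per row; each is wrapped under "data".
inputs = [{"data": row} for row in data.to_dict(orient="records")]
# [{'data': {'query': 'What is 2+2?'}}, {'data': {'query': 'Capital of France?'}}]
```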
@@ -75,29 +94,58 @@ class RunSubmitterClient:
     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
         run = self._get_run(client_run)

-
-
+        def concat(*dataframes: pd.DataFrame) -> pd.DataFrame:
+            return pd.concat(dataframes, axis=1, verify_integrity=True)

-        def
-
-            if i >= stop_at:
-                break
-            for k, value in line.items():
-                key = f"{prefix}.{k}"
-                data[key].append(value)
+        def to_dataframe(items: Sequence[Mapping[str, Any]], *, max_length: Optional[int] = None) -> pd.DataFrame:
+            """Convert a sequence of dictionaries to a DataFrame.

-
-
-
-
-
+            :param items: Sequence of dictionaries to convert.
+            :type items: Sequence[Mapping[str, Any]]
+            :param max_length: Maximum number of items to include in the DataFrame. If None, include all items.
+            :type max_length: Optional[int]
+            :return: DataFrame containing the items.
+            :rtype: pd.DataFrame
+            """
+            max_length = None if all_results else self._config.default_num_results
+            return pd.DataFrame(data=items if all_results else itertools.islice(items, max_length))

-
-
+        inputs = concat(
+            to_dataframe(run.inputs), to_dataframe([{LINE_NUMBER: i} for i in range(len(run.inputs))])
+        ).add_prefix(Prefixes.INPUTS)
+
+        outputs = to_dataframe(run.outputs).add_prefix(Prefixes.OUTPUTS)
+
+        return concat(inputs, outputs)

     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
         run = self._get_run(client_run)
-        return
+        return {**run.metrics, **self._get_aggregated_metrics(client_run)}
+
+    def _get_aggregated_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        aggregated_metrics = None
+        run = self._get_run(client_run)
+        try:
+            if _has_aggregator(run.dynamic_callable):
+                result_df = pd.DataFrame(run.outputs)
+                if len(result_df.columns) == 1 and result_df.columns[0] == "output":
+                    aggregate_input = result_df["output"].tolist()
+                else:
+                    aggregate_input = [AttrDict(item) for item in result_df.to_dict("records")]
+
+                aggr_func = getattr(run.dynamic_callable, "__aggregate__")
+                aggregated_metrics = aggr_func(aggregate_input)
+
+        except Exception as ex:  # pylint: disable=broad-exception-caught
+            LOGGER.warning("Error calculating aggregations for evaluator, failed with error %s", ex)
+
+        if not isinstance(aggregated_metrics, dict):
+            LOGGER.warning(
+                "Aggregated metrics for evaluator is not a dictionary will not be logged as metrics",
+            )
+            return {}
+
+        return aggregated_metrics

     def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
         run = self._get_run(client_run)
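The new `_get_aggregated_metrics` calls an evaluator's `__aggregate__` attribute (when `_has_aggregator` detects one) with the per-line outputs, and discards the result unless it is a dict. A hypothetical evaluator shaped to satisfy that contract:

```python
class ExactMatchEvaluator:
    """Hypothetical evaluator: per-line score plus a run-level aggregate."""

    def __call__(self, *, response: str, ground_truth: str) -> dict:
        return {"exact_match": float(response.strip() == ground_truth.strip())}

    def __aggregate__(self, line_results: list) -> dict:
        # Called once with all per-line outputs; must return a dict, or the
        # client logs a warning and drops the result.
        scores = [r["exact_match"] for r in line_results]
        return {"exact_match_rate": sum(scores) / len(scores) if scores else 0.0}


evaluator = ExactMatchEvaluator()
rows = [
    evaluator(response="4", ground_truth="4"),
    evaluator(response="5", ground_truth="4"),
]
print(evaluator.__aggregate__(rows))  # {'exact_match_rate': 0.5}
```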
@@ -110,7 +158,7 @@ class RunSubmitterClient:
             "duration": str(run.duration),
             "completed_lines": total_lines - failed_lines,
             "failed_lines": failed_lines,
-
+            "log_path": None,
         }

     @staticmethod
azure/ai/evaluation/_evaluate/_eval_run.py

@@ -81,6 +81,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
     :param promptflow_run: The promptflow run used by the
     :type promptflow_run: Optional[promptflow._sdk.entities.Run]
+    :param tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+    :type tags: Optional[Dict[str, str]]
     """

     _MAX_RETRIES = 5
@@ -98,6 +100,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         workspace_name: str,
         management_client: LiteMLClient,
         promptflow_run: Optional[Run] = None,
+        tags: Optional[Dict[str, str]] = None,
     ) -> None:
         self._tracking_uri: str = tracking_uri
         self._subscription_id: str = subscription_id
@@ -107,6 +110,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         self._is_promptflow_run: bool = promptflow_run is not None
         self._run_name = run_name
         self._promptflow_run = promptflow_run
+        self._tags = tags or {}
         self._status = RunStatus.NOT_STARTED
         self._url_base: Optional[str] = None
         self._info: Optional[RunInfo] = None
@@ -173,11 +177,20 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             )
         else:
             url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
+
+        # Prepare tags: start with user tags, ensure mlflow.user is set
+        run_tags = self._tags.copy()
+        if "mlflow.user" not in run_tags:
+            run_tags["mlflow.user"] = "azure-ai-evaluation"
+
+        # Convert tags to MLflow format
+        tags_list = [{"key": key, "value": value} for key, value in run_tags.items()]
+
         body = {
             "experiment_id": "0",
             "user_id": "azure-ai-evaluation",
             "start_time": int(time.time() * 1000),
-            "tags":
+            "tags": tags_list,
         }
         if self._run_name:
             body["run_name"] = self._run_name