azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (85)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  5. azure/ai/evaluation/_common/rai_service.py +3 -3
  6. azure/ai/evaluation/_common/utils.py +74 -17
  7. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  8. azure/ai/evaluation/_converters/_models.py +75 -26
  9. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  10. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  11. azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
  12. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
  13. azure/ai/evaluation/_evaluate/_utils.py +5 -2
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  15. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  21. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  23. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  24. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  25. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  29. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  30. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
  31. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  37. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
  38. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
  39. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  40. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
  42. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
  43. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  44. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
  45. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
  46. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  48. azure/ai/evaluation/_exceptions.py +1 -0
  49. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  50. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  51. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  52. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  53. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  54. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  55. azure/ai/evaluation/_version.py +1 -1
  56. azure/ai/evaluation/red_team/__init__.py +4 -3
  57. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  58. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  59. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  60. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  61. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  62. azure/ai/evaluation/red_team/_red_team.py +655 -2665
  63. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  64. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  65. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  66. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  67. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  68. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  69. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  70. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  71. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  72. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  73. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  74. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  75. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  76. azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  78. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
  80. azure/ai/evaluation/simulator/_simulator.py +12 -0
  81. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
  82. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
  83. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  84. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  85. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_converters/_models.py
@@ -3,17 +3,31 @@ import json
 
 from pydantic import BaseModel
 
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 # Models moved in a later version of agents SDK, so try a few different locations
-try:
-    from azure.ai.projects.models import RunStepFunctionToolCall
-except ImportError:
-    pass
-try:
-    from azure.ai.agents.models import RunStepFunctionToolCall
-except ImportError:
-    pass
+# Only import for type checking to avoid runtime import errors
+if TYPE_CHECKING:
+    try:
+        from azure.ai.projects.models import RunStepFunctionToolCall
+    except ImportError:
+        try:
+            from azure.ai.agents.models import RunStepFunctionToolCall
+        except ImportError:
+            # Create a protocol for type checking when the real class isn't available
+            from typing import Protocol
+
+            class RunStepFunctionToolCall(Protocol):
+                """Protocol defining the expected interface for RunStepFunctionToolCall."""
+
+                id: str
+                type: str
+
+                def get(self, key: str, default: Any = None) -> Any: ...
+
+else:
+    # At runtime, we don't need the actual class since it's only used in type annotations
+    RunStepFunctionToolCall = Any
 
 # Message roles constants.
 _SYSTEM = "system"
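
Note: with this change the agents SDK is only consulted by static type checkers; at runtime the name falls back to Any, so importing this module no longer fails when neither azure.ai.projects nor azure.ai.agents is installed. A minimal standalone sketch of the same pattern (the module and class names below are illustrative, not part of this package):

    from typing import TYPE_CHECKING, Any

    if TYPE_CHECKING:
        # Seen only by static type checkers; never executed at runtime.
        from some_optional_sdk import HeavyModel  # hypothetical optional dependency
    else:
        # At runtime the annotation target degrades to Any, so this module
        # imports cleanly even when the optional SDK is absent.
        HeavyModel = Any

    def describe(model: "HeavyModel") -> str:
        # The annotation still helps tooling; the body relies on duck typing only.
        return str(getattr(model, "id", "<unknown>"))
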
@@ -33,9 +47,12 @@ _TOOL_CALLS = "tool_calls"
 # Constants to only be used internally in this file for the built-in tools.
 _CODE_INTERPRETER = "code_interpreter"
 _BING_GROUNDING = "bing_grounding"
+_BING_CUSTOM_SEARCH = "bing_custom_search"
 _FILE_SEARCH = "file_search"
 _AZURE_AI_SEARCH = "azure_ai_search"
+_SHAREPOINT_GROUNDING = "sharepoint_grounding"
 _FABRIC_DATAAGENT = "fabric_dataagent"
+_OPENAPI = "openapi"
 
 # Built-in tool descriptions and parameters are hidden, but we include basic descriptions
 # for evaluation purposes.
@@ -44,8 +61,10 @@ _BUILT_IN_DESCRIPTIONS = {
     + "generate code, and create graphs and charts using your data. Supports "
     + "up to 20 files.",
     _BING_GROUNDING: "Enhance model output with web data.",
-    _FILE_SEARCH: "Search for data across uploaded files.",
+    _BING_CUSTOM_SEARCH: "Enables agents to retrieve content from a curated subset of websites, enhancing relevance and reducing noise from public web searches.",
+    _FILE_SEARCH: "Search for data across uploaded files. A single call can return multiple results/files in the 'results' field.",
     _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _SHAREPOINT_GROUNDING: "Allows agents to access and retrieve relevant content from Microsoft SharePoint document libraries, grounding responses in organizational knowledge.",
     _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
 }
 
@@ -59,6 +78,15 @@ _BUILT_IN_PARAMS = {
         "type": "object",
         "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
     },
+    _BING_CUSTOM_SEARCH: {
+        "type": "object",
+        "properties": {
+            "requesturl": {
+                "type": "string",
+                "description": "Search queries, along with pre-configured site restrictions or domain filters.",
+            }
+        },
+    },
     _FILE_SEARCH: {
         "type": "object",
         "properties": {
@@ -76,6 +104,12 @@ _BUILT_IN_PARAMS = {
         "type": "object",
         "properties": {"input": {"type": "string", "description": "Search terms to use."}},
     },
+    _SHAREPOINT_GROUNDING: {
+        "type": "object",
+        "properties": {
+            "input": {"type": "string", "description": "A natural language query to search SharePoint content."}
+        },
+    },
     _FABRIC_DATAAGENT: {
         "type": "object",
         "properties": {"input": {"type": "string", "description": "Search terms to use."}},
@@ -217,6 +251,27 @@ class ToolDefinition(BaseModel):
     parameters: dict
 
 
+class OpenAPIToolDefinition(BaseModel):
+    """Represents OpenAPI tool definition that will be used in the agent.
+    :param name: The name of the tool.
+    :type name: str
+    :param type: The type of the tool.
+    :type type: str
+    :param description: A description of the tool.
+    :type description: str
+    :param parameters: The parameters required by the tool.
+    :type parameters: dict
+    """
+
+    name: str
+    type: str
+    description: Optional[str] = None
+    spec: object
+    auth: object
+    default_params: Optional[list[str]] = None
+    functions: list[ToolDefinition]
+
+
 class ToolCall:
     """Represents a tool call, used as an intermediate step in the conversion process.
 
@@ -247,7 +302,7 @@ class EvaluatorData(BaseModel):
 
     query: List[Message]
     response: List[Message]
-    tool_definitions: List[ToolDefinition]
+    tool_definitions: List[Union[ToolDefinition, OpenAPIToolDefinition]]
 
     def to_json(self):
        """Converts the result to a JSON string.
@@ -277,14 +332,16 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # all in most of the cases, and bing would only show the API URL, without arguments or results.
     # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
     # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
-    if hasattr(tool_call.details, _FUNCTION):
+    if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
         # This is the internals of the content object that will be included with the tool call.
         tool_call_id = tool_call.details.id
         content_tool_call = {
             "type": _TOOL_CALL,
             "tool_call_id": tool_call_id,
-            "name": tool_call.details.function.name,
-            "arguments": safe_loads(tool_call.details.function.arguments),
+            "name": tool_call.details.get(_FUNCTION).get("name") if tool_call.details.get(_FUNCTION) else None,
+            "arguments": safe_loads(
+                tool_call.details.get(_FUNCTION).get("arguments") if tool_call.details.get(_FUNCTION) else None
+            ),
         }
     else:
         # Treat built-in tools separately. Object models may be unique so handle each case separately
@@ -322,27 +379,19 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # assistant's action of calling the tool.
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
-    if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function["output"])
+    if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
+        output = safe_loads(tool_call.details.get("function")["output"])
     else:
         try:
             # Some built-ins may have output, others may not
             # Try to retrieve it, but if we don't find anything, skip adding the message
             # Just manually converting to dicts for easy serialization for now rather than custom serializers
             if tool_call.details.type == _CODE_INTERPRETER:
-                output = tool_call.details.code_interpreter.outputs
+                output = [result.as_dict() for result in tool_call.details.code_interpreter.outputs]
             elif tool_call.details.type == _BING_GROUNDING:
                 return messages  # not supported yet from bing grounding tool
             elif tool_call.details.type == _FILE_SEARCH:
-                output = [
-                    {
-                        "file_id": result.file_id,
-                        "file_name": result.file_name,
-                        "score": result.score,
-                        "content": result.content,
-                    }
-                    for result in tool_call.details.file_search.results
-                ]
+                output = [result.as_dict() for result in tool_call.details.file_search.results]
             elif tool_call.details.type == _AZURE_AI_SEARCH:
                 output = tool_call.details.azure_ai_search["output"]
             elif tool_call.details.type == _FABRIC_DATAAGENT:
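
Note: the function-call branch now reads details through mapping-style access (.get) instead of attribute access, so tool-call details supplied as plain dictionaries are handled as well. A small standalone sketch of that shape, using json.loads in place of the module's safe_loads and entirely made-up values:

    import json

    details = {
        "id": "call_123",
        "function": {
            "name": "fetch_weather",
            "arguments": '{"city": "Seattle"}',
            "output": '{"forecast": "sunny"}',
        },
    }

    function = details.get("function")
    content_tool_call = {
        "type": "tool_call",
        "tool_call_id": details["id"],
        "name": function.get("name") if function else None,
        "arguments": json.loads(function.get("arguments")) if function else None,
    }
    output = json.loads(function["output"])
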
azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py
@@ -6,6 +6,7 @@ import asyncio
 import logging
 import pandas as pd
 import sys
+import itertools
 from collections import defaultdict
 from concurrent.futures import Future
 from os import PathLike
@@ -16,15 +17,34 @@ from ..._legacy._batch_engine._run_submitter import RunSubmitter
 from ..._legacy._batch_engine._config import BatchEngineConfig
 from ..._legacy._batch_engine._run import Run
 from ..._legacy._adapters._constants import LINE_NUMBER
+from ..._legacy._adapters.types import AttrDict
 from ..._legacy._common._thread_pool_executor_with_context import ThreadPoolExecutorWithContext
+from ..._evaluate._utils import _has_aggregator
+from ..._constants import Prefixes, PF_BATCH_TIMEOUT_SEC
 
+from .._utils import get_int_env_var as get_int
 
-LOGGER = logging.getLogger(__name__)
+
+LOGGER = logging.getLogger("run")
+MISSING_VALUE: Final[int] = sys.maxsize
 
 
 class RunSubmitterClient:
-    def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
-        self._config = config or BatchEngineConfig(LOGGER, use_async=True)
+    def __init__(self, *, raise_on_errors: bool = False, config: Optional[BatchEngineConfig] = None) -> None:
+        if config:
+            self._config = config
+        else:
+            # Generate default config and apply any overrides to the configuration from environment variables
+            self._config = BatchEngineConfig(LOGGER, use_async=True)
+            if (val := get_int(PF_BATCH_TIMEOUT_SEC, MISSING_VALUE)) != MISSING_VALUE:
+                self._config.batch_timeout_seconds = val
+            if (val := get_int("PF_LINE_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
+                self._config.line_timeout_seconds = val
+            if (val := get_int("PF_WORKER_COUNT", MISSING_VALUE)) != MISSING_VALUE:
+                self._config.max_concurrency = val
+
+        self._config.raise_on_error = raise_on_errors
+
         self._thread_pool = ThreadPoolExecutorWithContext(
             thread_name_prefix="evaluators_thread", max_workers=self._config.max_concurrency
         )
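
Note: when no explicit BatchEngineConfig is supplied, the client now applies overrides from the PF_BATCH_TIMEOUT_SEC, PF_LINE_TIMEOUT_SEC, and PF_WORKER_COUNT environment variables (PF_BATCH_TIMEOUT_SEC is assumed here to be a constant holding the variable name of the same spelling). A simplified standalone sketch of the override logic, with a stand-in config class and placeholder defaults:

    import os
    import sys
    from dataclasses import dataclass

    MISSING_VALUE = sys.maxsize  # sentinel meaning "variable not set"

    @dataclass
    class _Config:  # stand-in for BatchEngineConfig; defaults are placeholders
        batch_timeout_seconds: int = 3600
        line_timeout_seconds: int = 600
        max_concurrency: int = 10

    def _get_int(name: str, default: int) -> int:
        try:
            return int(os.environ.get(name, default))
        except ValueError:
            return default

    config = _Config()
    if (val := _get_int("PF_BATCH_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
        config.batch_timeout_seconds = val
    if (val := _get_int("PF_LINE_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
        config.line_timeout_seconds = val
    if (val := _get_int("PF_WORKER_COUNT", MISSING_VALUE)) != MISSING_VALUE:
        config.max_concurrency = val
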
@@ -44,7 +64,6 @@ class RunSubmitterClient:
         # input. Update the inputs so that each entry is a dictionary with a data key
         # that contains the original input data.
         inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]
-
         # Pass the correct previous run to the evaluator
         run: Optional[BatchClientRun] = kwargs.pop("run", None)
         if run:
@@ -75,29 +94,58 @@ class RunSubmitterClient:
     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
         run = self._get_run(client_run)
 
-        data: Dict[str, List[Any]] = defaultdict(list)
-        stop_at: Final[int] = self._config.default_num_results if not all_results else sys.maxsize
+        def concat(*dataframes: pd.DataFrame) -> pd.DataFrame:
+            return pd.concat(dataframes, axis=1, verify_integrity=True)
 
-        def _update(prefix: str, items: Sequence[Mapping[str, Any]]) -> None:
-            for i, line in enumerate(items):
-                if i >= stop_at:
-                    break
-                for k, value in line.items():
-                    key = f"{prefix}.{k}"
-                    data[key].append(value)
+        def to_dataframe(items: Sequence[Mapping[str, Any]], *, max_length: Optional[int] = None) -> pd.DataFrame:
+            """Convert a sequence of dictionaries to a DataFrame.
 
-        # Go from a list of dictionaries (i.e. a row view of the data) to a dictionary of lists
-        # (i.e. a column view of the data)
-        _update("inputs", run.inputs)
-        _update("inputs", [{LINE_NUMBER: i} for i in range(len(run.inputs))])
-        _update("outputs", run.outputs)
+            :param items: Sequence of dictionaries to convert.
+            :type items: Sequence[Mapping[str, Any]]
+            :param max_length: Maximum number of items to include in the DataFrame. If None, include all items.
+            :type max_length: Optional[int]
+            :return: DataFrame containing the items.
+            :rtype: pd.DataFrame
+            """
+            max_length = None if all_results else self._config.default_num_results
+            return pd.DataFrame(data=items if all_results else itertools.islice(items, max_length))
 
-        df = pd.DataFrame(data).reindex(columns=[k for k in data.keys()])
-        return df
+        inputs = concat(
+            to_dataframe(run.inputs), to_dataframe([{LINE_NUMBER: i} for i in range(len(run.inputs))])
+        ).add_prefix(Prefixes.INPUTS)
+
+        outputs = to_dataframe(run.outputs).add_prefix(Prefixes.OUTPUTS)
+
+        return concat(inputs, outputs)
 
     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
         run = self._get_run(client_run)
-        return dict(run.metrics)
+        return {**run.metrics, **self._get_aggregated_metrics(client_run)}
+
+    def _get_aggregated_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        aggregated_metrics = None
+        run = self._get_run(client_run)
+        try:
+            if _has_aggregator(run.dynamic_callable):
+                result_df = pd.DataFrame(run.outputs)
+                if len(result_df.columns) == 1 and result_df.columns[0] == "output":
+                    aggregate_input = result_df["output"].tolist()
+                else:
+                    aggregate_input = [AttrDict(item) for item in result_df.to_dict("records")]
+
+                aggr_func = getattr(run.dynamic_callable, "__aggregate__")
+                aggregated_metrics = aggr_func(aggregate_input)
+
+        except Exception as ex:  # pylint: disable=broad-exception-caught
+            LOGGER.warning("Error calculating aggregations for evaluator, failed with error %s", ex)
+
+        if not isinstance(aggregated_metrics, dict):
+            LOGGER.warning(
+                "Aggregated metrics for evaluator is not a dictionary will not be logged as metrics",
+            )
+            return {}
+
+        return aggregated_metrics
 
     def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
         run = self._get_run(client_run)
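
Note: get_metrics now merges in run-level metrics produced by an evaluator's __aggregate__ hook, which _get_aggregated_metrics looks up via getattr on the evaluator callable. A hypothetical evaluator showing the expected shape of that hook; only the hook name comes from the code above, everything else is illustrative:

    class KeywordEvaluator:
        # Per-line evaluation: returns a dict of metrics for one row.
        def __call__(self, *, response: str) -> dict:
            return {"contains_keyword": float("azure" in response.lower())}

        # Run-level aggregation: receives the per-line outputs and returns
        # metrics that get merged into the get_metrics() result.
        def __aggregate__(self, line_results: list) -> dict:
            scores = [line["contains_keyword"] for line in line_results]
            return {"contains_keyword.mean": sum(scores) / len(scores) if scores else 0.0}
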
@@ -110,7 +158,7 @@ class RunSubmitterClient:
             "duration": str(run.duration),
             "completed_lines": total_lines - failed_lines,
             "failed_lines": failed_lines,
-            # "log_path": "",
+            "log_path": None,
         }
 
     @staticmethod
azure/ai/evaluation/_evaluate/_eval_run.py
@@ -81,6 +81,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
     :param promptflow_run: The promptflow run used by the
     :type promptflow_run: Optional[promptflow._sdk.entities.Run]
+    :param tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+    :type tags: Optional[Dict[str, str]]
     """
 
     _MAX_RETRIES = 5
@@ -98,6 +100,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         workspace_name: str,
         management_client: LiteMLClient,
         promptflow_run: Optional[Run] = None,
+        tags: Optional[Dict[str, str]] = None,
     ) -> None:
         self._tracking_uri: str = tracking_uri
         self._subscription_id: str = subscription_id
@@ -107,6 +110,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         self._is_promptflow_run: bool = promptflow_run is not None
         self._run_name = run_name
         self._promptflow_run = promptflow_run
+        self._tags = tags or {}
         self._status = RunStatus.NOT_STARTED
         self._url_base: Optional[str] = None
         self._info: Optional[RunInfo] = None
@@ -173,11 +177,20 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             )
         else:
             url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
+
+            # Prepare tags: start with user tags, ensure mlflow.user is set
+            run_tags = self._tags.copy()
+            if "mlflow.user" not in run_tags:
+                run_tags["mlflow.user"] = "azure-ai-evaluation"
+
+            # Convert tags to MLflow format
+            tags_list = [{"key": key, "value": value} for key, value in run_tags.items()]
+
             body = {
                 "experiment_id": "0",
                 "user_id": "azure-ai-evaluation",
                 "start_time": int(time.time() * 1000),
-                "tags": [{"key": "mlflow.user", "value": "azure-ai-evaluation"}],
+                "tags": tags_list,
             }
             if self._run_name:
                 body["run_name"] = self._run_name