azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (144)
  1. azure/ai/evaluation/__init__.py +10 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +7 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +165 -34
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  59. azure/ai/evaluation/_converters/_models.py +76 -6
  60. azure/ai/evaluation/_eval_mapping.py +73 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  63. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  64. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  65. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  66. azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
  67. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
  68. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  69. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  70. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  71. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  72. azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  76. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  77. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  78. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  79. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  80. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  81. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
  82. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  83. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
  84. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  85. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
  86. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
  87. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  88. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  89. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  90. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
  91. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
  92. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  93. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  94. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  95. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  96. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
  97. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
  98. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
  99. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  100. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  101. azure/ai/evaluation/_exceptions.py +2 -0
  102. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  103. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  104. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  106. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  107. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  109. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  110. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  111. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  112. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  113. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  114. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  115. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  116. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  117. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  118. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  119. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
  120. azure/ai/evaluation/_version.py +1 -1
  121. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  122. azure/ai/evaluation/red_team/_red_team.py +976 -546
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  125. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  126. azure/ai/evaluation/simulator/_constants.py +1 -0
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  128. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  129. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
  140. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
  141. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  142. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  143. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  144. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_converters/_models.py
@@ -3,10 +3,18 @@ import json
 
 from pydantic import BaseModel
 
-from azure.ai.projects.models import RunStepFunctionToolCall
-
 from typing import List, Optional, Union
 
+# Models moved in a later version of agents SDK, so try a few different locations
+try:
+    from azure.ai.projects.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+try:
+    from azure.ai.agents.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+
 # Message roles constants.
 _SYSTEM = "system"
 _USER = "user"
@@ -21,6 +29,57 @@ _FUNCTION = "function"
 # This is returned by AI services in the API to filter against tool invocations.
 _TOOL_CALLS = "tool_calls"
 
+# Constants to only be used internally in this file for the built-in tools.
+_CODE_INTERPRETER = "code_interpreter"
+_BING_GROUNDING = "bing_grounding"
+_FILE_SEARCH = "file_search"
+_AZURE_AI_SEARCH = "azure_ai_search"
+_FABRIC_DATAAGENT = "fabric_dataagent"
+
+# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
+# for evaluation purposes.
+_BUILT_IN_DESCRIPTIONS = {
+    _CODE_INTERPRETER: "Use code interpreter to read and interpret information from datasets, "
+    + "generate code, and create graphs and charts using your data. Supports "
+    + "up to 20 files.",
+    _BING_GROUNDING: "Enhance model output with web data.",
+    _FILE_SEARCH: "Search for data across uploaded files.",
+    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
+}
+
+# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
+_BUILT_IN_PARAMS = {
+    _CODE_INTERPRETER: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Generated code to be executed."}},
+    },
+    _BING_GROUNDING: {
+        "type": "object",
+        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
+    },
+    _FILE_SEARCH: {
+        "type": "object",
+        "properties": {
+            "ranking_options": {
+                "type": "object",
+                "properties": {
+                    "ranker": {"type": "string", "description": "Ranking algorithm to use."},
+                    "score_threshold": {"type": "number", "description": "Threshold for search results."},
+                },
+                "description": "Ranking options for search results.",
+            }
+        },
+    },
+    _AZURE_AI_SEARCH: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+    _FABRIC_DATAAGENT: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+}
 
 class Message(BaseModel):
     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
@@ -98,6 +157,8 @@ class ToolDefinition(BaseModel):
 
     :param name: The name of the tool.
     :type name: str
+    :param type: The type of the tool.
+    :type type: str
     :param description: A description of the tool.
     :type description: str
     :param parameters: The parameters required by the tool.
@@ -105,6 +166,7 @@ class ToolDefinition(BaseModel):
     """
 
     name: str
+    type: str
     description: Optional[str] = None
     parameters: dict
 
@@ -191,6 +253,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
             arguments = {
                 "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
             }
+        elif tool_call.details["type"] == "azure_ai_search":
+            arguments = {"input": tool_call.details["azure_ai_search"]["input"]}
+        elif tool_call.details["type"] == "fabric_dataagent":
+            arguments = {"input": tool_call.details["fabric_dataagent"]["input"]}
         else:
             # unsupported tool type, skip
             return messages
@@ -211,17 +277,17 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
     if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function.output)
+        output = safe_loads(tool_call.details.function["output"])
     else:
         try:
             # Some built-ins may have output, others may not
             # Try to retrieve it, but if we don't find anything, skip adding the message
             # Just manually converting to dicts for easy serialization for now rather than custom serializers
-            if tool_call.details.type == "code_interpreter":
+            if tool_call.details.type == _CODE_INTERPRETER:
                 output = tool_call.details.code_interpreter.outputs
-            elif tool_call.details.type == "bing_grounding":
+            elif tool_call.details.type == _BING_GROUNDING:
                 return messages  # not supported yet from bing grounding tool
-            elif tool_call.details.type == "file_search":
+            elif tool_call.details.type == _FILE_SEARCH:
                 output = [
                     {
                         "file_id": result.file_id,
@@ -231,6 +297,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
                     }
                     for result in tool_call.details.file_search.results
                 ]
+            elif tool_call.details.type == _AZURE_AI_SEARCH:
+                output = tool_call.details.azure_ai_search["output"]
+            elif tool_call.details.type == _FABRIC_DATAAGENT:
+                output = tool_call.details.fabric_dataagent["output"]
         except:
             return messages
 
azure/ai/evaluation/_eval_mapping.py (new file)
@@ -0,0 +1,73 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+# Note: This was removed from the normal constants file due to circular import issues.
+
+# In the future, it would be nice to instead rely on the id value
+# of each eval class, but I wouldn't like to rely on those before
+# we simplify them into version-less, static values, instead of the
+# problematic registry references they currently are.
+
+# Import all evals
+from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+from azure.ai.evaluation import (
+    BleuScoreEvaluator,
+    CodeVulnerabilityEvaluator,
+    CoherenceEvaluator,
+    ContentSafetyEvaluator,
+    DocumentRetrievalEvaluator,
+    F1ScoreEvaluator,
+    FluencyEvaluator,
+    GleuScoreEvaluator,
+    GroundednessEvaluator,
+    GroundednessProEvaluator,
+    HateUnfairnessEvaluator,
+    IndirectAttackEvaluator,
+    IntentResolutionEvaluator,
+    MeteorScoreEvaluator,
+    ProtectedMaterialEvaluator,
+    QAEvaluator,
+    RelevanceEvaluator,
+    ResponseCompletenessEvaluator,
+    RetrievalEvaluator,
+    RougeScoreEvaluator,
+    SelfHarmEvaluator,
+    SexualEvaluator,
+    SimilarityEvaluator,
+    TaskAdherenceEvaluator,
+    ToolCallAccuracyEvaluator,
+    UngroundedAttributesEvaluator,
+    ViolenceEvaluator
+)
+
+EVAL_CLASS_MAP = {
+    BleuScoreEvaluator: "bleu_score",
+    CodeVulnerabilityEvaluator: "code_vulnerability",
+    CoherenceEvaluator: "coherence",
+    ContentSafetyEvaluator: "content_safety",
+    DocumentRetrievalEvaluator: "document_retrieval",
+    ECIEvaluator: "eci",
+    F1ScoreEvaluator: "f1_score",
+    FluencyEvaluator: "fluency",
+    GleuScoreEvaluator: "gleu_score",
+    GroundednessEvaluator: "groundedness",
+    GroundednessProEvaluator: "groundedness_pro",
+    HateUnfairnessEvaluator: "hate_unfairness",
+    IndirectAttackEvaluator: "indirect_attack",
+    IntentResolutionEvaluator: "intent_resolution",
+    MeteorScoreEvaluator: "meteor_score",
+    ProtectedMaterialEvaluator: "protected_material",
+    QAEvaluator: "qa",
+    RelevanceEvaluator: "relevance",
+    ResponseCompletenessEvaluator: "response_completeness",
+    RetrievalEvaluator: "retrieval",
+    RougeScoreEvaluator: "rouge_score",
+    SelfHarmEvaluator: "self_harm",
+    SexualEvaluator: "sexual",
+    SimilarityEvaluator: "similarity",
+    TaskAdherenceEvaluator: "task_adherence",
+    ToolCallAccuracyEvaluator: "tool_call_accuracy",
+    UngroundedAttributesEvaluator: "ungrounded_attributes",
+    ViolenceEvaluator: "violence",
+}
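A short usage sketch for the new mapping; the metric_name_for helper below is illustrative and assumes the import path matches the new azure/ai/evaluation/_eval_mapping.py file listed above:

from typing import Optional

from azure.ai.evaluation import FluencyEvaluator
from azure.ai.evaluation._eval_mapping import EVAL_CLASS_MAP

def metric_name_for(evaluator: object) -> Optional[str]:
    # Accept either the evaluator class itself or an instance of it.
    cls = evaluator if isinstance(evaluator, type) else type(evaluator)
    return EVAL_CLASS_MAP.get(cls)

print(metric_name_for(FluencyEvaluator))  # "fluency"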
azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py
@@ -2,11 +2,12 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import asyncio
 import logging
 import pandas as pd
 import sys
 from collections import defaultdict
-from concurrent.futures import Future, ThreadPoolExecutor
+from concurrent.futures import Future
 from os import PathLike
 from typing import Any, Callable, Dict, Final, List, Mapping, Optional, Sequence, Union, cast
 
@@ -14,6 +15,8 @@ from .batch_clients import BatchClientRun, HasAsyncCallable
 from ..._legacy._batch_engine._run_submitter import RunSubmitter
 from ..._legacy._batch_engine._config import BatchEngineConfig
 from ..._legacy._batch_engine._run import Run
+from ..._legacy._adapters._constants import LINE_NUMBER
+from ..._legacy._common._thread_pool_executor_with_context import ThreadPoolExecutorWithContext
 
 
 LOGGER = logging.getLogger(__name__)
@@ -22,7 +25,9 @@ LOGGER = logging.getLogger(__name__)
 class RunSubmitterClient:
     def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
         self._config = config or BatchEngineConfig(LOGGER, use_async=True)
-        self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+        self._thread_pool = ThreadPoolExecutorWithContext(
+            thread_name_prefix="evaluators_thread",
+            max_workers=self._config.max_concurrency)
 
     def run(
         self,
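ThreadPoolExecutorWithContext comes from the new azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py added in this release (its 15 lines are not shown in this excerpt). A plausible sketch of such an executor, assuming it propagates the submitter's contextvars to worker threads, is:

import contextvars
from concurrent.futures import ThreadPoolExecutor

class ThreadPoolExecutorWithContext(ThreadPoolExecutor):
    """ThreadPoolExecutor that runs each task inside a copy of the submitter's context."""

    def submit(self, fn, /, *args, **kwargs):
        context = contextvars.copy_context()
        # context.run invokes fn(*args, **kwargs) with the captured contextvars active.
        return super().submit(context.run, fn, *args, **kwargs)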
@@ -33,30 +38,36 @@ class RunSubmitterClient:
         **kwargs: Any,
     ) -> BatchClientRun:
         if not isinstance(data, pd.DataFrame):
-            # Should never get here
             raise ValueError("Data must be a pandas DataFrame")
-        if not column_mapping:
-            raise ValueError("Column mapping must be provided")
 
-        # The column mappings are index by data to indicate they come from the data
+        # The column mappings are indexed by data to indicate they come from the data
         # input. Update the inputs so that each entry is a dictionary with a data key
         # that contains the original input data.
         inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]
 
-        # always uses async behind the scenes
+        # Pass the correct previous run to the evaluator
+        run: Optional[BatchClientRun] = kwargs.pop("run", None)
+        if run:
+            kwargs["run"] = self._get_run(run)
+
+        # Try to get async function to use
         if isinstance(flow, HasAsyncCallable):
             flow = flow._to_async()  # pylint: disable=protected-access
 
-        run_submitter = RunSubmitter(self._config)
+        # Start an event loop for async execution on a thread pool thread to separate it
+        # from the caller's thread.
+        run_submitter = RunSubmitter(self._config, self._thread_pool)
         run_future = self._thread_pool.submit(
-            run_submitter.submit,
-            dynamic_callable=flow,
-            inputs=inputs,
-            column_mapping=column_mapping,
-            name_prefix=evaluator_name,
-            created_on=kwargs.pop("created_on", None),
-            storage_creator=kwargs.pop("storage_creator", None),
-            **kwargs,
+            asyncio.run,
+            run_submitter.submit(
+                dynamic_callable=flow,
+                inputs=inputs,
+                column_mapping=column_mapping,
+                name_prefix=evaluator_name,
+                created_on=kwargs.pop("created_on", None),
+                storage_creator=kwargs.pop("storage_creator", None),
+                **kwargs,
+            )
         )
 
         return run_future
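The rewritten submit call hands asyncio.run an un-awaited coroutine, so the event loop is created and torn down on the worker thread instead of the caller's thread. A self-contained illustration of that pattern, with generic names not taken from this package:

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def evaluate_row(row: dict) -> dict:
    await asyncio.sleep(0)          # stand-in for real async work
    return {"score": len(row["query"])}

pool = ThreadPoolExecutor(max_workers=1)
# asyncio.run receives the coroutine object and drives it to completion on the pool thread.
future = pool.submit(asyncio.run, evaluate_row({"query": "hello"}))
print(future.result())              # {'score': 5}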
@@ -75,7 +86,10 @@
                 key = f"{prefix}.{k}"
                 data[key].append(value)
 
+        # Go from a list of dictionaries (i.e. a row view of the data) to a dictionary of lists
+        # (i.e. a column view of the data)
         _update("inputs", run.inputs)
+        _update("inputs", [{ LINE_NUMBER: i } for i in range(len(run.inputs)) ])
         _update("outputs", run.outputs)
 
         df = pd.DataFrame(data).reindex(columns=[k for k in data.keys()])
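A toy illustration of the row-view to column-view reshaping that the new comment describes, including the synthetic line-number column (assuming LINE_NUMBER resolves to the string "line_number"; the sample data is invented):

from collections import defaultdict
import pandas as pd

rows = [{"query": "a", "context": "x"}, {"query": "b", "context": "y"}]

data = defaultdict(list)
for index, row in enumerate(rows):
    for key, value in row.items():
        data[f"inputs.{key}"].append(value)
    data["inputs.line_number"].append(index)

df = pd.DataFrame(data)
# columns: inputs.query, inputs.context, inputs.line_number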
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
@@ -8,6 +8,10 @@ from typing import Optional, Type, Union
 from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from azure.ai.evaluation._legacy._adapters.utils import ClientUserAgentUtil
 from azure.ai.evaluation._legacy._adapters.tracing import inject_openai_api, recover_openai_api
+from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
+    inject_openai_api as ported_inject_openai_api,
+    recover_openai_api as ported_recover_openai_api,
+)
 
 from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
@@ -68,6 +72,7 @@ class EvalRunContext:
 
         if isinstance(self.client, RunSubmitterClient):
             set_event_loop_policy()
+            ported_inject_openai_api()
 
     def __exit__(
         self,
@@ -92,3 +97,6 @@
         if self._is_otel_timeout_set_by_system:
             os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
             self._is_otel_timeout_set_by_system = False
+
+        if isinstance(self.client, RunSubmitterClient):
+            ported_recover_openai_api()
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
@@ -58,6 +58,11 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
         if not name:
             name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
 
+        # Pass the correct previous run to the evaluator
+        run: Optional[BatchClientRun] = kwargs.pop("run", None)
+        if run:
+            kwargs["run"] = self.get_result(run)
+
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
             self._pf_client.run,
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
@@ -5,8 +5,15 @@ import os
 import types
 from typing import Optional, Type
 
+from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClient
+from azure.ai.evaluation._evaluate._batch_run import RunSubmitterClient
 from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
+    inject_openai_api as ported_inject_openai_api,
+    recover_openai_api as ported_recover_openai_api,
+)
 from azure.ai.evaluation._constants import PF_DISABLE_TRACING
+from azure.ai.evaluation._evaluate._utils import set_event_loop_policy
 
 
 class TargetRunContext:
@@ -16,7 +23,8 @@ class TargetRunContext:
     :type upload_snapshot: bool
     """
 
-    def __init__(self, upload_snapshot: bool = False) -> None:
+    def __init__(self, client: BatchClient, upload_snapshot: bool = False) -> None:
+        self._client = client
         self._upload_snapshot = upload_snapshot
         self._original_cwd = os.getcwd()
 
@@ -32,6 +40,11 @@
 
         os.environ[PF_DISABLE_TRACING] = "true"
 
+        if isinstance(self._client, RunSubmitterClient):
+            ported_inject_openai_api()
+            # For addressing the issue of asyncio event loop closed on Windows
+            set_event_loop_policy()
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
@@ -44,3 +57,6 @@
         os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
 
         os.environ.pop(PF_DISABLE_TRACING, None)
+
+        if isinstance(self._client, RunSubmitterClient):
+            ported_recover_openai_api()
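With the constructor change above, call sites must now pass the batch client so the context manager can decide whether to apply the ported OpenAI patching. A hedged usage sketch; the surrounding call site is assumed rather than taken from this diff:

client = RunSubmitterClient()

with TargetRunContext(client, upload_snapshot=False):
    # Target invocation happens here; for RunSubmitterClient the OpenAI API is
    # patched on entry and restored on exit, per the changes above.
    ...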
azure/ai/evaluation/_evaluate/_eval_run.py
@@ -295,7 +295,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"
 
     def _get_token(self) -> str:
-        return self._management_client.get_token()
+        return self._management_client.get_token().token
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
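The one-line fix reflects that credential get_token calls return an azure.core.credentials.AccessToken (with token and expires_on attributes) rather than a raw string. A brief illustration using azure-identity; the management-client wrapper itself is not shown in this diff:

from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
access_token = credential.get_token("https://management.azure.com/.default")

# access_token.token is the bearer string; access_token.expires_on is a POSIX timestamp.
headers = {"Authorization": f"Bearer {access_token.token}"}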