azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff reflects the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (150)
  1. azure/ai/evaluation/__init__.py +9 -16
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +5 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +159 -29
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +80 -2
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/__init__.py +1 -1
  59. azure/ai/evaluation/_converters/_ai_services.py +4 -4
  60. azure/ai/evaluation/_eval_mapping.py +71 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  63. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +17 -4
  64. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  65. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  66. azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
  67. azure/ai/evaluation/_evaluate/_evaluate.py +372 -105
  68. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
  69. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
  70. azure/ai/evaluation/_evaluate/_utils.py +120 -7
  71. azure/ai/evaluation/_evaluators/_common/_base_eval.py +9 -4
  72. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  76. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
  77. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
  78. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +2 -2
  79. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
  80. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
  81. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +8 -2
  82. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  83. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
  84. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
  86. azure/ai/evaluation/_exceptions.py +2 -0
  87. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  88. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  89. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  90. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  91. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  92. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  93. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  94. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  95. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  96. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  97. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  98. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  99. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  100. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  101. azure/ai/evaluation/_legacy/_batch_engine/_result.py +7 -1
  102. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  103. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  104. azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  106. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  107. azure/ai/evaluation/{_red_team/_utils → _legacy/_common}/__init__.py +1 -1
  108. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  109. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  110. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  111. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  112. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  113. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  114. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
  115. azure/ai/evaluation/_version.py +1 -1
  116. azure/ai/evaluation/red_team/__init__.py +19 -0
  117. azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
  118. azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +4 -1
  119. azure/ai/evaluation/{_red_team → red_team}/_red_team.py +885 -481
  120. azure/ai/evaluation/red_team/_red_team_result.py +382 -0
  121. azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
  122. azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
  123. azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
  124. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  125. azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +9 -5
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  127. azure/ai/evaluation/simulator/_constants.py +1 -0
  128. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  129. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. azure/ai/evaluation/simulator/_simulator.py +1 -1
  140. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +36 -2
  141. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +148 -80
  142. azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
  143. azure/ai/evaluation/simulator/_tracing.py +0 -89
  144. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  145. /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
  146. /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
  147. /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
  148. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
  149. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
  150. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_legacy/_batch_engine/_engine.py

@@ -10,21 +10,24 @@
  # porting over the code largely as is to remove the Promptflow dependency
  # as quickly as possible. In phase 2 this code will be heavily refactored.

+ import inspect
  import re
  import asyncio
+
  from math import floor
  from asyncio import Semaphore
+ from concurrent.futures import Executor
+ from functools import partial
  from contextlib import contextmanager
- from dataclasses import dataclass
- from datetime import datetime, timedelta, timezone
- from typing import Any, Callable, Dict, Final, Generator, Mapping, MutableMapping, Optional, Sequence, Set, Tuple
+ from datetime import datetime, timezone
+ from typing import Any, Callable, Dict, Final, Generator, Mapping, MutableMapping, Optional, Sequence, Set, Tuple, cast
  from uuid import uuid4

- from ._utils import get_int_env_var, get_value_from_path
+ from ._utils import DEFAULTS_KEY, get_int_env_var, get_value_from_path, is_async_callable
  from ._status import BatchStatus
  from ._result import BatchResult, BatchRunDetails, BatchRunError, TokenMetrics
  from ._run_storage import AbstractRunStorage, NoOpRunStorage
- from ._logging import log_progress, NodeLogManager
+ from .._common._logging import log_progress, NodeLogManager
  from ..._exceptions import ErrorBlame
  from ._exceptions import (
      BatchEngineCanceledError,
@@ -37,6 +40,7 @@ from ._utils_deprecated import (
      async_run_allowing_running_loop,
      convert_eager_flow_output_to_dict,
  )
+ from ._openai_injector import CaptureOpenAITokenUsage


  MAX_WORKER_COUNT: Final[int] = 10
@@ -48,51 +52,37 @@ class BatchEngine:

      def __init__(
          self,
-         executor: Callable,
+         func: Callable,
          *,
          storage: Optional[AbstractRunStorage] = None,
          batch_timeout_sec: Optional[int] = None,
          line_timeout_sec: Optional[int] = None,
          max_worker_count: Optional[int] = None,
-         **kwargs: Any,
+         executor: Optional[Executor] = None,
      ):
          """Create a new batch engine instance

-         :param Callable executor: The executor to run the flow
+         :param Callable func: The function to run the flow
          :param Optional[AbstractRunStorage] storage: The storage to store execution results
          :param Optional[int] batch_timeout_sec: The timeout of batch run in seconds
          :param Optional[int] line_timeout_sec: The timeout of each line in seconds
          :param Optional[int] max_worker_count: The concurrency limit of batch run
-         :param kwargs: The keyword arguments related to creating the executor proxy class
-         :type kwargs: Any
+         :param Optional[Executor] executor: The executor to run the flow (if needed)
          """

-         self._executor = executor
-         # self._working_dir = working_dir
-
-         # self._is_eager_flow = True
-         # self._is_prompty_flow = False
-         # self._program_language = FlowLanguage.Python
-         # self._message_format = MessageFormatType.BASIC
-         # self._multimedia_processor = MultimediaProcessor.create(self._message_format)
-         # self._connections = {}
-
+         self._func: Callable = func
          self._storage: AbstractRunStorage = storage or NoOpRunStorage()

          # TODO ralphe: Consume these from the batch context/config instead of from
          # kwargs or (even worse) environment variables
-         # self._batch_use_async = kwargs.get("batch_use_async", True)
          self._batch_timeout_sec = batch_timeout_sec or get_int_env_var("PF_BATCH_TIMEOUT_SEC")
          self._line_timeout_sec = line_timeout_sec or get_int_env_var("PF_LINE_TIMEOUT_SEC", 600)
          self._max_worker_count = max_worker_count or get_int_env_var("PF_WORKER_COUNT") or MAX_WORKER_COUNT
-         # update kwargs with worker_count and line_timeout_sec
-         kwargs.update({"worker_count": self._max_worker_count, "line_timeout_sec": self._line_timeout_sec})

+         self._executor: Optional[Executor] = executor
          self._is_canceled: bool = False
-         self._kwargs: Mapping[str, Any] = kwargs
-         # self._init_kwargs: Mapping[str, Any] = init_kwargs or {}

-     def run(
+     async def run(
          self,
          data: Sequence[Mapping[str, Any]],
          column_mapping: Mapping[str, str],
@@ -113,9 +103,7 @@ class BatchEngine:

          try:
              id = id or str(uuid4())
-
-             result: BatchResult = async_run_allowing_running_loop(self._exec_in_task, id, batch_inputs, start_time)
-
+             result: BatchResult = await self._exec_in_task(id, batch_inputs, start_time)
              return result
          except Exception as ex:
              raise BatchEngineError(
@@ -136,6 +124,7 @@ class BatchEngine:

          inputs: Sequence[Mapping[str, Any]] = []
          line: int = 0
+         defaults = cast(Mapping[str, Any], column_mapping.get(DEFAULTS_KEY, {}))

          for input in data:
              line += 1
@@ -143,6 +132,10 @@ class BatchEngine:
              missing_inputs: Set[str] = set()

              for key, value in column_mapping.items():
+                 if key == DEFAULTS_KEY:
+                     # Skip the defaults key
+                     continue
+
                  if not isinstance(value, str):
                      # All non-string values are literal values.
                      mapped[key] = value
@@ -156,6 +149,9 @@ class BatchEngine:

                  dict_path = match.group(1)
                  found, value = get_value_from_path(dict_path, input)
+                 if not found:  # try default value
+                     found, value = get_value_from_path(dict_path, defaults)
+
                  if found:
                      mapped[key] = value
                  else:
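A note on the defaults handling introduced above: a ${data.*} reference is resolved against the input row first, and only falls back to the generated defaults when the row has no matching value. A minimal, self-contained sketch of that resolution order (resolve_line and the DEFAULTS_KEY value shown here are illustrative stand-ins, not the SDK's public API; the real constant comes from the engine's _utils module):

import re
from typing import Any, Dict, Mapping

DEFAULTS_KEY = "__defaults__"          # illustrative sentinel; the real value comes from _utils
_PATTERN = re.compile(r"^\$\{data\.(.+)\}$")

def resolve_line(row: Mapping[str, Any], column_mapping: Mapping[str, Any]) -> Dict[str, Any]:
    """Resolve one data row against a column mapping, falling back to declared defaults."""
    defaults: Mapping[str, Any] = column_mapping.get(DEFAULTS_KEY, {})
    mapped: Dict[str, Any] = {}
    for key, value in column_mapping.items():
        if key == DEFAULTS_KEY:
            continue                   # the defaults entry is metadata, not a column
        match = _PATTERN.match(value) if isinstance(value, str) else None
        if match is None:
            mapped[key] = value        # literal values pass through unchanged
            continue
        name = match.group(1)
        # look in the row first, then in the function's default arguments
        mapped[key] = row.get(name, defaults.get(name))
    return mapped

# resolve_line({"query": "hi"}, {"query": "${data.query}", "k": "${data.k}", DEFAULTS_KEY: {"k": 3}})
# -> {"query": "hi", "k": 3}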
@@ -306,11 +302,34 @@ class BatchEngine:

          try:
              # TODO ralphe: Handle line timeouts here
-             output: Any = await self._executor(**inputs)
+             with CaptureOpenAITokenUsage() as captured_tokens:
+                 # NOTE: In the legacy code, any synchronous functions were executed in a different process
+                 # for isolation reasons. However this isolation was violated in the way the code was
+                 # used by the evaluation SDK (e.g. you need to have the module already loaded to pass the
+                 # callable into the batch engine, so starting a new process to examine it was redundant).
+                 # It also came with performance and memory usage costs (each line was processed in a
+                 # separate process up to a maximum of 4), and these processes were created and torn down
+                 # too frequently.
+                 # For now we will just run the function in the current process, but in the future we may
+                 # want to consider running the function in a separate process for isolation reasons.
+                 output: Any
+                 if is_async_callable(self._func):
+                     output = await self._func(**inputs)
+                 else:
+                     # to maximize the parallelism, we run the synchronous function in a separate thread
+                     # and await its result
+                     output = await asyncio.get_event_loop().run_in_executor(
+                         self._executor,
+                         partial(self._func, **inputs))
+
+                 # This should in theory never happen but as an extra precaution, let's check if the output
+                 # is awaitable and await it if it is.
+                 if inspect.isawaitable(output):
+                     output = await output
+
              details.status = BatchStatus.Completed
              details.result = convert_eager_flow_output_to_dict(output)
-
-             # TODO figure out how to get the token metrics here
+             details.tokens.update(captured_tokens)
          except Exception as ex:
              details.status = BatchStatus.Failed
              details.error = BatchRunError(
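The dispatch added above is the standard asyncio pattern for mixing coroutine and plain callables: await the former directly, and push the latter onto a worker thread so the event loop stays responsive while the line executes. A stripped-down sketch, assuming inspect.iscoroutinefunction is an acceptable stand-in for the SDK's is_async_callable helper:

import asyncio
import inspect
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Any, Callable, Mapping, Optional

async def run_one_line(func: Callable[..., Any], inputs: Mapping[str, Any],
                       executor: Optional[ThreadPoolExecutor] = None) -> Any:
    """Await async callables directly; run sync callables on a worker thread."""
    if inspect.iscoroutinefunction(func):
        output = await func(**inputs)
    else:
        # run_in_executor keeps the event loop free while the sync target runs
        loop = asyncio.get_running_loop()
        output = await loop.run_in_executor(executor, partial(func, **inputs))
    if inspect.isawaitable(output):      # e.g. a sync wrapper that returned a coroutine
        output = await output
    return output

# asyncio.run(run_one_line(lambda query: {"answer": query.upper()}, {"query": "hi"}))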

azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py

@@ -4,20 +4,126 @@

  # Original source code: promptflow-tracing/promptflow/tracing/_integrations/_openai_injector.py

+ import functools
+ import importlib
+ import logging
+
+ from contextvars import ContextVar
+ from typing import Any, Callable, Final, Generator, Optional, Protocol, Sequence, Tuple
+
+ from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
+ from azure.ai.evaluation._legacy._batch_engine._result import TokenMetrics
+
+
+ _token_metrics: ContextVar[TokenMetrics] = ContextVar("token_metrics", default=TokenMetrics(0, 0, 0))
+ KEY_ATTR_ORIGINAL: Final[str] = "_original"
+
+
+ class _TokenMetrics(Protocol):
+     """Protocol class to represent the token metrics."""
+
+     prompt_tokens: int
+     completion_tokens: int
+     total_tokens: int
+
+
+ class _WithUsage(Protocol):
+     """Protocol class to represent an OpenAI object that may have a token usage property/attribute."""
+
+     usage: Optional[_TokenMetrics]
+
+
+ def _wrap_openai_api_method(method: Callable, is_async: bool) -> Callable:
+     """Wraps the OpenAI API method to inject logic to run on the result of the call."""
+
+     def update_usage(result: _WithUsage) -> None:
+         if hasattr(result, "usage") and result.usage is not None:
+             usage = _token_metrics.get()
+             usage.prompt_tokens += result.usage.prompt_tokens
+             usage.completion_tokens += result.usage.completion_tokens
+             usage.total_tokens += result.usage.total_tokens
+
+     if is_async:
+
+         @functools.wraps(method)
+         async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
+             result: _WithUsage = await method(*args, **kwargs)
+             update_usage(result)
+             return result
+
+         return async_wrapper
+     else:
+
+         @functools.wraps(method)
+         def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
+             result: _WithUsage = method(*args, **kwargs)
+             update_usage(result)
+             return result
+
+         return sync_wrapper
+
+
+ def _openai_api_list() -> Generator[Tuple[Any, Callable, bool], None, None]:
+     """Load the list of OpenAI API classes and their corresponding method names."""
+
+     apis: Sequence[Tuple[str, str, str, bool]] = [
+         ("openai.resources.chat", "Completions", "create", False),
+         ("openai.resources.chat", "AsyncCompletions", "create", True),
+         ("openai.resources", "Completions", "create", False),
+         ("openai.resources", "AsyncCompletions", "create", True),
+         ("openai.resources", "Embeddings", "create", False),
+         ("openai.resources", "AsyncEmbeddings", "create", True),
+         ("openai.resources", "Responses", "create", False),
+         ("openai.resources", "AsyncResponses", "create", True),
+     ]
+
+     for module_name, class_name, method_name, is_async in apis:
+         try:
+             module = importlib.import_module(module_name)
+             cls = getattr(module, class_name, None)
+             if cls is None:
+                 continue
+             method = getattr(cls, method_name, None)
+             if method is None:
+                 continue
+             yield cls, method, is_async
+         except ImportError:
+             raise MissingRequiredPackage("Please install the 'openai' package to use the Azure AI Evaluation SDK")
+         except AttributeError:
+             logging.warning("The module '%s' does not have class '%s' or method '%s'", module_name, class_name, method_name)
+

  def inject_openai_api():
-     """This function:
-     1. Modifies the create methods of the OpenAI API classes to inject logic before calling the original methods.
-        It stores the original methods as _original attributes of the create methods.
-     2. Updates the openai api configs from environment variables.
+     """This function modifies the create methods of the OpenAI API classes to inject logic
+     to enable us to collect token usage data.
      """
-     # TODO ralphe: Port function?
-     pass
+     for cls, method, is_async in _openai_api_list():
+         # Check if the create method of the openai_api class has already been modified
+         if not hasattr(method, KEY_ATTR_ORIGINAL):
+             wrapper_method: Callable = _wrap_openai_api_method(method, is_async)
+             setattr(wrapper_method, KEY_ATTR_ORIGINAL, method)
+             setattr(cls, method.__name__, wrapper_method)


  def recover_openai_api():
      """This function restores the original create methods of the OpenAI API classes
      by assigning them back from the _original attributes of the modified methods.
      """
-     # TODO ralphe: Port function?
-     pass
+     for cls, method, _ in _openai_api_list():
+         if hasattr(method, KEY_ATTR_ORIGINAL):
+             original_method = getattr(method, KEY_ATTR_ORIGINAL)
+             setattr(cls, method.__name__, original_method)
+
+
+ class CaptureOpenAITokenUsage:
+     """Context manager to capture OpenAI token usage."""
+     def __init__(self):
+         self._tokens = TokenMetrics(0, 0, 0)
+
+     def __enter__(self) -> TokenMetrics:
+         _token_metrics.set(TokenMetrics(0, 0, 0))
+         return self._tokens
+
+     def __exit__(self, exc_type: Optional[Exception], exc_value: Optional[Exception], traceback: Optional[Any]) -> None:
+         captured_metrics = _token_metrics.get()
+         self._tokens.update(captured_metrics)
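The injector above combines two common Python techniques: monkey-patching a client's create method while stashing the original on a _original attribute so the patch can be undone, and accumulating usage into a ContextVar that a context manager reads back out. A self-contained sketch of the same pattern against a dummy client (no openai dependency; every name below is illustrative, not the SDK's API):

import functools
from contextvars import ContextVar
from dataclasses import dataclass
from typing import Any, Callable

@dataclass
class Usage:
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

_usage: ContextVar[Usage] = ContextVar("usage", default=Usage())

class DummyCompletions:
    def create(self, **kwargs: Any) -> Any:
        # stand-in for an OpenAI response object carrying a .usage attribute
        return type("Resp", (), {"usage": Usage(3, 5, 8)})()

def _wrap(method: Callable) -> Callable:
    @functools.wraps(method)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        result = method(*args, **kwargs)
        if getattr(result, "usage", None) is not None:
            acc = _usage.get()
            acc.prompt_tokens += result.usage.prompt_tokens
            acc.completion_tokens += result.usage.completion_tokens
            acc.total_tokens += result.usage.total_tokens
        return result
    wrapper._original = method            # remembered so the patch can be reverted
    return wrapper

class CaptureUsage:
    def __enter__(self) -> Usage:
        self._tokens = Usage()
        _usage.set(self._tokens)          # wrapper accumulates straight into this object
        return self._tokens
    def __exit__(self, *exc: Any) -> None:
        pass

DummyCompletions.create = _wrap(DummyCompletions.create)   # patch
with CaptureUsage() as tokens:
    DummyCompletions().create(prompt="hi")
print(tokens.total_tokens)  # 8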

azure/ai/evaluation/_legacy/_batch_engine/_result.py

@@ -20,6 +20,12 @@ class TokenMetrics:
      total_tokens: int
      """The total number of tokens used in the run."""

+     def update(self, other: "TokenMetrics") -> None:
+         """Update the token metrics with another set of token metrics."""
+         self.prompt_tokens += other.prompt_tokens
+         self.completion_tokens += other.completion_tokens
+         self.total_tokens += other.total_tokens
+

  @dataclass
  class BatchRunError:
@@ -96,4 +102,4 @@ class BatchResult:
          """The results of the batch run."""
          if not self.details:
              return []
-         return [d.result for d in self.details]
+         return [d.result for d in self.details]

azure/ai/evaluation/_legacy/_batch_engine/_run.py

@@ -60,6 +60,7 @@ class Run:
          inputs: Sequence[Mapping[str, Any]],
          column_mapping: Mapping[str, str],
          created_on: Optional[datetime] = None,
+         run: Optional["Run"] = None,
      ):
          self._status: RunStatus = RunStatus.NOT_STARTED
          self._created_on = created_on or datetime.now(timezone.utc)
@@ -72,6 +73,7 @@ class Run:
          self.column_mapping = column_mapping
          self.result: Optional[BatchResult] = None
          self.metrics: Mapping[str, Any] = {}
+         self._run = run

          # self._use_remote_flow = False
          # self._from_flex_flow = True
@@ -105,6 +107,10 @@ class Run:

          return [value or {} for value in self.result.results]

+     @property
+     def previous_run(self) -> Optional["Run"]:
+         return self._run
+
      @staticmethod
      def _generate_run_name(name_prefix: Optional[str], creation_time: datetime) -> str:
          # The Promptflow code looked at the folder name of the temporary folder used to

azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py

@@ -3,17 +3,20 @@
  # ---------------------------------------------------------

  import dataclasses
+ import inspect
  import sys
+
+ from concurrent.futures import Executor
  from datetime import datetime, timezone
  from typing import Any, Callable, Dict, Mapping, Optional, Sequence, TextIO, Union

  from ._run import Run, RunStatus
- from ._trace import start_trace, is_collection_writeable
+ from ._trace import start_trace
  from ._run_storage import AbstractRunStorage, NoOpRunStorage
- from ._logging import incremental_print, print_red_error
+ from .._common._logging import incremental_print, print_red_error
  from ._config import BatchEngineConfig
  from ._exceptions import BatchEngineValidationError
- from ._engine import BatchEngine, BatchEngineError, BatchResult
+ from ._engine import DEFAULTS_KEY, BatchEngine, BatchEngineError, BatchResult


  class RunSubmitter:
@@ -22,25 +25,32 @@ class RunSubmitter:

      THIS WILL BE REMOVED IN A FUTURE CODE UPDATE"""

-     def __init__(self, config: BatchEngineConfig):
+     def __init__(self, config: BatchEngineConfig, executor: Optional[Executor] = None):
          # self._client = PFClient instance
          # self._config = PFClient config
          # self.run_operations = RunOperations instance

          # TODO ralphe: Use proper logger here. Old code did LoggerFactory.get_logger(__name__)
          self._config = config
+         self._executor = executor

-     def submit(
+     async def submit(
          self,
          dynamic_callable: Callable,
          inputs: Sequence[Mapping[str, Any]],
-         column_mapping: Mapping[str, str],
+         column_mapping: Optional[Mapping[str, str]],
          *,
          name_prefix: Optional[str] = None,
          created_on: Optional[datetime] = None,
          storage_creator: Optional[Callable[[Run], AbstractRunStorage]] = None,
          **kwargs,
      ) -> Run:
+
+         # if the column mappings are not provided, generate them based on the arguments to the
+         # flow function.
+         if column_mapping is None:
+             column_mapping = self._generate_column_mapping(dynamic_callable)
+
          # The old code always spun up two threads here using a ThreadPoolExecutor:
          # 1. One thread essentially did nothing of value (since tracing was disabled, and we
          #    don't care about checking for the latest PromptFlow version number now)
@@ -51,27 +61,18 @@
          #    of the _run_bulk code here directly.
          # In a future code refactor, all of this will be cleaned up in favour of proper
          # async/await code.
-         run: Run = kwargs.pop("run", None) or Run(
+
+         run: Run = Run(
              dynamic_callable=dynamic_callable,
              name_prefix=name_prefix,
              inputs=inputs,
              column_mapping=column_mapping,
              created_on=created_on,
+             run=kwargs.pop("run", None),
          )

-         logger = self._config.logger
          attributes: Dict[str, Any] = kwargs.get("attributes", {})
-         collection_for_run: Optional[str] = None
-
-         logger.debug("start trace for flow run...")
-         logger.debug("flow path for run.start_trace: %s", run.name)
-
-         if is_collection_writeable():
-             logger.debug("trace collection is writeable, will use flow name as collection...")
-             collection_for_run = run.name
-             logger.debug("collection for run: %s", collection_for_run)
-         else:
-             logger.debug("trace collection is protected, will honor existing collection.")
+         collection_for_run: str = run.name
          start_trace(attributes=attributes, run=run, _collection=collection_for_run)

          self._validate_inputs(run=run)
@@ -81,12 +82,12 @@
          run._status = RunStatus.PREPARING

          # unnecessary Flow loading code was removed here. Instead do direct calls to _submit_bulk_run
-         self._submit_bulk_run(run=run, local_storage=local_storage, **kwargs)
+         await self._submit_bulk_run(run=run, local_storage=local_storage, **kwargs)

          self.stream_run(run=run, storage=local_storage, raise_on_error=True)
          return run

-     def _submit_bulk_run(self, run: Run, local_storage: AbstractRunStorage, **kwargs) -> None:
+     async def _submit_bulk_run(self, run: Run, local_storage: AbstractRunStorage, **kwargs) -> None:
          logger = self._config.logger

          logger.info(f"Submitting run {run.name}, log path: {local_storage.logger.file_path}")
@@ -95,6 +96,29 @@
          # removed since it is unnecessary. It also parsed and set environment variables. This
          # has also been removed since it can be problematic in a multi-threaded environment.

+         if run.previous_run:
+             previous: Optional[Run] = run.previous_run
+             if previous.status != RunStatus.COMPLETED:
+                 raise BatchEngineValidationError(
+                     f"Referenced run {previous.name} is not completed, got status {previous.status.value}."
+                 )
+             if previous.outputs is not None:
+                 if len(previous.outputs) != len(run.inputs):
+                     raise BatchEngineValidationError(
+                         f"Referenced run {previous.name} has {len(previous.outputs)} outputs, "
+                         f"but {len(run.inputs)} inputs are provided."
+                     )
+
+                 # load in the previous run's outputs and inputs into the list of dictionaries to allow for
+                 # the previous run's outputs to be used as inputs for the current run
+                 run.inputs = [
+                     {
+                         "run.outputs": previous.outputs[i],
+                         "run.inputs": previous.inputs[i],
+                         **run.inputs[i]
+                     }
+                     for i in range(len(run.inputs))]
+
          self._validate_column_mapping(run.column_mapping)

          run._status = RunStatus.RUNNING
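The chaining support above makes a completed run's per-line outputs and inputs visible to the next run under the "run.outputs" and "run.inputs" keys of each input row. A plain-dict sketch of that merge (the helper name is illustrative):

from typing import Any, Dict, List, Mapping, Sequence

def merge_previous_run(inputs: Sequence[Mapping[str, Any]],
                       prev_inputs: Sequence[Mapping[str, Any]],
                       prev_outputs: Sequence[Mapping[str, Any]]) -> List[Dict[str, Any]]:
    """Line i of the new run sees its own columns plus line i of the previous run."""
    if len(prev_outputs) != len(inputs):
        raise ValueError("previous run must have one output per input line")
    return [
        {"run.outputs": prev_outputs[i], "run.inputs": prev_inputs[i], **inputs[i]}
        for i in range(len(inputs))
    ]

# merge_previous_run([{"query": "hi"}], [{"query": "hi"}], [{"response": "hello"}])
# -> [{"run.outputs": {"response": "hello"}, "run.inputs": {"query": "hi"}, "query": "hi"}]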
@@ -108,10 +132,10 @@
              batch_timeout_sec=self._config.batch_timeout_seconds,
              line_timeout_sec=self._config.run_timeout_seconds,
              max_worker_count=self._config.max_concurrency,
-             **kwargs,
+             executor=self._executor,
          )

-         batch_result = batch_engine.run(data=run.inputs, column_mapping=run.column_mapping, id=run.name)
+         batch_result = await batch_engine.run(data=run.inputs, column_mapping=run.column_mapping, id=run.name)
          run._status = RunStatus.from_batch_result_status(batch_result.status)

          error_logs: Sequence[str] = []
@@ -152,10 +176,30 @@
          run.metrics = system_metrics
          run.result = batch_result

+     @staticmethod
+     def _generate_column_mapping(function: Callable) -> Mapping[str, Any]:
+         args = inspect.signature(function).parameters
+         default_values: Dict[str, Any] = {}
+         mapping: Dict[str, Any] = {}
+         for key, value in args.items():
+             if key in ["self", "cls"] or value.kind in [value.VAR_POSITIONAL, value.VAR_KEYWORD]:
+                 continue
+
+             mapping[key] = f"${{data.{key}}}"
+             if value.default != inspect.Parameter.empty:
+                 default_values[key] = value.default
+
+         return {
+             **mapping,
+             DEFAULTS_KEY: default_values,
+         }
+
      @staticmethod
      def _validate_inputs(run: Run):
-         if not run.inputs:
-             raise BatchEngineValidationError("Data must be specified for evaluation run.")
+         if not run.inputs and not run.previous_run:
+             raise BatchEngineValidationError(
+                 "Either data, or a previous run must be specified for the evaluation run."
+             )

      @staticmethod
      def _validate_column_mapping(column_mapping: Mapping[str, str]):
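The auto-generated column mapping above simply mirrors the target function's signature: each parameter becomes a ${data.<name>} reference, and declared defaults are collected under the defaults key so missing columns can fall back to them. A standalone sketch of the same idea (the DEFAULTS_KEY value is assumed for illustration; the SDK keeps the real constant in its _utils module):

import inspect
from typing import Any, Callable, Dict, Mapping

DEFAULTS_KEY = "__defaults__"  # assumed sentinel name, for illustration only

def generate_column_mapping(func: Callable[..., Any]) -> Mapping[str, Any]:
    """Map every mappable parameter to a ${data.<name>} reference and collect defaults."""
    mapping: Dict[str, Any] = {}
    defaults: Dict[str, Any] = {}
    for name, param in inspect.signature(func).parameters.items():
        if name in ("self", "cls") or param.kind in (param.VAR_POSITIONAL, param.VAR_KEYWORD):
            continue  # *args/**kwargs cannot be mapped to data columns
        mapping[name] = f"${{data.{name}}}"
        if param.default is not inspect.Parameter.empty:
            defaults[name] = param.default
    return {**mapping, DEFAULTS_KEY: defaults}

def evaluate(query: str, threshold: float = 0.5, **kwargs: Any) -> dict: ...

# generate_column_mapping(evaluate)
# -> {"query": "${data.query}", "threshold": "${data.threshold}", "__defaults__": {"threshold": 0.5}}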
@@ -178,10 +222,6 @@
          :param AbstractRunStorage storage: The storage to use for the output.
          """

-         # TODO ralphe: This doesn't seem to be do anything useful beyond just print
-         #              a run summary at the end. This is because by the time it gets
-         #              invoked even in the original code, the run has already completed.
-
          if run is None or storage is None:
              return

azure/ai/evaluation/_legacy/_batch_engine/_status.py

@@ -22,4 +22,4 @@ class BatchStatus(IntEnum):

      @staticmethod
      def is_failed(status: "BatchStatus") -> bool:
-         return status == BatchStatus.Failed or status == BatchStatus.Canceled
+         return status == BatchStatus.Failed or status == BatchStatus.Canceled