azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +9 -16
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +5 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +159 -29
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +80 -2
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +4 -4
- azure/ai/evaluation/_eval_mapping.py +71 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +17 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +372 -105
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +120 -7
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +9 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +2 -2
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +8 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +7 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/{_red_team/_utils → _legacy/_common}/__init__.py +1 -1
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +4 -1
- azure/ai/evaluation/{_red_team → red_team}/_red_team.py +885 -481
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
- azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
- azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +9 -5
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +36 -2
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +148 -80
- azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
|
@@ -10,21 +10,24 @@
|
|
|
10
10
|
# porting over the code largely as is to remove the Promptflow dependency
|
|
11
11
|
# as quickly as possible. In phase 2 this code will be heavily refactored.
|
|
12
12
|
|
|
13
|
+
import inspect
|
|
13
14
|
import re
|
|
14
15
|
import asyncio
|
|
16
|
+
|
|
15
17
|
from math import floor
|
|
16
18
|
from asyncio import Semaphore
|
|
19
|
+
from concurrent.futures import Executor
|
|
20
|
+
from functools import partial
|
|
17
21
|
from contextlib import contextmanager
|
|
18
|
-
from
|
|
19
|
-
from
|
|
20
|
-
from typing import Any, Callable, Dict, Final, Generator, Mapping, MutableMapping, Optional, Sequence, Set, Tuple
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
from typing import Any, Callable, Dict, Final, Generator, Mapping, MutableMapping, Optional, Sequence, Set, Tuple, cast
|
|
21
24
|
from uuid import uuid4
|
|
22
25
|
|
|
23
|
-
from ._utils import get_int_env_var, get_value_from_path
|
|
26
|
+
from ._utils import DEFAULTS_KEY, get_int_env_var, get_value_from_path, is_async_callable
|
|
24
27
|
from ._status import BatchStatus
|
|
25
28
|
from ._result import BatchResult, BatchRunDetails, BatchRunError, TokenMetrics
|
|
26
29
|
from ._run_storage import AbstractRunStorage, NoOpRunStorage
|
|
27
|
-
from ._logging import log_progress, NodeLogManager
|
|
30
|
+
from .._common._logging import log_progress, NodeLogManager
|
|
28
31
|
from ..._exceptions import ErrorBlame
|
|
29
32
|
from ._exceptions import (
|
|
30
33
|
BatchEngineCanceledError,
|
|
@@ -37,6 +40,7 @@ from ._utils_deprecated import (
|
|
|
37
40
|
async_run_allowing_running_loop,
|
|
38
41
|
convert_eager_flow_output_to_dict,
|
|
39
42
|
)
|
|
43
|
+
from ._openai_injector import CaptureOpenAITokenUsage
|
|
40
44
|
|
|
41
45
|
|
|
42
46
|
MAX_WORKER_COUNT: Final[int] = 10
|
|
@@ -48,51 +52,37 @@ class BatchEngine:
|
|
|
48
52
|
|
|
49
53
|
def __init__(
|
|
50
54
|
self,
|
|
51
|
-
|
|
55
|
+
func: Callable,
|
|
52
56
|
*,
|
|
53
57
|
storage: Optional[AbstractRunStorage] = None,
|
|
54
58
|
batch_timeout_sec: Optional[int] = None,
|
|
55
59
|
line_timeout_sec: Optional[int] = None,
|
|
56
60
|
max_worker_count: Optional[int] = None,
|
|
57
|
-
|
|
61
|
+
executor: Optional[Executor] = None,
|
|
58
62
|
):
|
|
59
63
|
"""Create a new batch engine instance
|
|
60
64
|
|
|
61
|
-
:param Callable
|
|
65
|
+
:param Callable func: The function to run the flow
|
|
62
66
|
:param Optional[AbstractRunStorage] storage: The storage to store execution results
|
|
63
67
|
:param Optional[int] batch_timeout_sec: The timeout of batch run in seconds
|
|
64
68
|
:param Optional[int] line_timeout_sec: The timeout of each line in seconds
|
|
65
69
|
:param Optional[int] max_worker_count: The concurrency limit of batch run
|
|
66
|
-
:param
|
|
67
|
-
:type kwargs: Any
|
|
70
|
+
:param Optional[Executor] executor: The executor to run the flow (if needed)
|
|
68
71
|
"""
|
|
69
72
|
|
|
70
|
-
self.
|
|
71
|
-
# self._working_dir = working_dir
|
|
72
|
-
|
|
73
|
-
# self._is_eager_flow = True
|
|
74
|
-
# self._is_prompty_flow = False
|
|
75
|
-
# self._program_language = FlowLanguage.Python
|
|
76
|
-
# self._message_format = MessageFormatType.BASIC
|
|
77
|
-
# self._multimedia_processor = MultimediaProcessor.create(self._message_format)
|
|
78
|
-
# self._connections = {}
|
|
79
|
-
|
|
73
|
+
self._func: Callable = func
|
|
80
74
|
self._storage: AbstractRunStorage = storage or NoOpRunStorage()
|
|
81
75
|
|
|
82
76
|
# TODO ralphe: Consume these from the batch context/config instead of from
|
|
83
77
|
# kwargs or (even worse) environment variables
|
|
84
|
-
# self._batch_use_async = kwargs.get("batch_use_async", True)
|
|
85
78
|
self._batch_timeout_sec = batch_timeout_sec or get_int_env_var("PF_BATCH_TIMEOUT_SEC")
|
|
86
79
|
self._line_timeout_sec = line_timeout_sec or get_int_env_var("PF_LINE_TIMEOUT_SEC", 600)
|
|
87
80
|
self._max_worker_count = max_worker_count or get_int_env_var("PF_WORKER_COUNT") or MAX_WORKER_COUNT
|
|
88
|
-
# update kwargs with worker_count and line_timeout_sec
|
|
89
|
-
kwargs.update({"worker_count": self._max_worker_count, "line_timeout_sec": self._line_timeout_sec})
|
|
90
81
|
|
|
82
|
+
self._executor: Optional[Executor] = executor
|
|
91
83
|
self._is_canceled: bool = False
|
|
92
|
-
self._kwargs: Mapping[str, Any] = kwargs
|
|
93
|
-
# self._init_kwargs: Mapping[str, Any] = init_kwargs or {}
|
|
94
84
|
|
|
95
|
-
def run(
|
|
85
|
+
async def run(
|
|
96
86
|
self,
|
|
97
87
|
data: Sequence[Mapping[str, Any]],
|
|
98
88
|
column_mapping: Mapping[str, str],
|
|
@@ -113,9 +103,7 @@ class BatchEngine:
|
|
|
113
103
|
|
|
114
104
|
try:
|
|
115
105
|
id = id or str(uuid4())
|
|
116
|
-
|
|
117
|
-
result: BatchResult = async_run_allowing_running_loop(self._exec_in_task, id, batch_inputs, start_time)
|
|
118
|
-
|
|
106
|
+
result: BatchResult = await self._exec_in_task(id, batch_inputs, start_time)
|
|
119
107
|
return result
|
|
120
108
|
except Exception as ex:
|
|
121
109
|
raise BatchEngineError(
|
|
@@ -136,6 +124,7 @@ class BatchEngine:
|
|
|
136
124
|
|
|
137
125
|
inputs: Sequence[Mapping[str, Any]] = []
|
|
138
126
|
line: int = 0
|
|
127
|
+
defaults = cast(Mapping[str, Any], column_mapping.get(DEFAULTS_KEY, {}))
|
|
139
128
|
|
|
140
129
|
for input in data:
|
|
141
130
|
line += 1
|
|
@@ -143,6 +132,10 @@ class BatchEngine:
|
|
|
143
132
|
missing_inputs: Set[str] = set()
|
|
144
133
|
|
|
145
134
|
for key, value in column_mapping.items():
|
|
135
|
+
if key == DEFAULTS_KEY:
|
|
136
|
+
# Skip the defaults key
|
|
137
|
+
continue
|
|
138
|
+
|
|
146
139
|
if not isinstance(value, str):
|
|
147
140
|
# All non-string values are literal values.
|
|
148
141
|
mapped[key] = value
|
|
@@ -156,6 +149,9 @@ class BatchEngine:
|
|
|
156
149
|
|
|
157
150
|
dict_path = match.group(1)
|
|
158
151
|
found, value = get_value_from_path(dict_path, input)
|
|
152
|
+
if not found: # try default value
|
|
153
|
+
found, value = get_value_from_path(dict_path, defaults)
|
|
154
|
+
|
|
159
155
|
if found:
|
|
160
156
|
mapped[key] = value
|
|
161
157
|
else:
|
|
@@ -306,11 +302,34 @@ class BatchEngine:
|
|
|
306
302
|
|
|
307
303
|
try:
|
|
308
304
|
# TODO ralphe: Handle line timeouts here
|
|
309
|
-
|
|
305
|
+
with CaptureOpenAITokenUsage() as captured_tokens:
|
|
306
|
+
# NOTE: In the legacy code, any synchronous functions were executed in a different process
|
|
307
|
+
# for isolation reasons. However this isolation was violated in the way the code was
|
|
308
|
+
# used by the evaluation SDK (e.g. you need to have the module already loaded to pass the
|
|
309
|
+
# callable into the batch engine, so starting a new process to examine it was redundant).
|
|
310
|
+
# It also came with performance and memory usage costs (each line was processed in a
|
|
311
|
+
# separate process up to a maximum of 4), and these processes were created and torn down
|
|
312
|
+
# too frequently.
|
|
313
|
+
# For now we will just run the function in the current process, but in the future we may
|
|
314
|
+
# want to consider running the function in a separate process for isolation reasons.
|
|
315
|
+
output: Any
|
|
316
|
+
if is_async_callable(self._func):
|
|
317
|
+
output = await self._func(**inputs)
|
|
318
|
+
else:
|
|
319
|
+
# to maximize the parallelism, we run the synchronous function in a separate thread
|
|
320
|
+
# and await its result
|
|
321
|
+
output = await asyncio.get_event_loop().run_in_executor(
|
|
322
|
+
self._executor,
|
|
323
|
+
partial(self._func, **inputs))
|
|
324
|
+
|
|
325
|
+
# This should in theory never happen but as an extra precaution, let's check if the output
|
|
326
|
+
# is awaitable and await it if it is.
|
|
327
|
+
if inspect.isawaitable(output):
|
|
328
|
+
output = await output
|
|
329
|
+
|
|
310
330
|
details.status = BatchStatus.Completed
|
|
311
331
|
details.result = convert_eager_flow_output_to_dict(output)
|
|
312
|
-
|
|
313
|
-
# TODO figure out how to get the token metrics here
|
|
332
|
+
details.tokens.update(captured_tokens)
|
|
314
333
|
except Exception as ex:
|
|
315
334
|
details.status = BatchStatus.Failed
|
|
316
335
|
details.error = BatchRunError(
|
|
@@ -4,20 +4,126 @@
|
|
|
4
4
|
|
|
5
5
|
# Original source code: promptflow-tracing/promptflow/tracing/_integrations/_openai_injector.py
|
|
6
6
|
|
|
7
|
+
import functools
|
|
8
|
+
import importlib
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from contextvars import ContextVar
|
|
12
|
+
from typing import Any, Callable, Final, Generator, Optional, Protocol, Sequence, Tuple
|
|
13
|
+
|
|
14
|
+
from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
|
|
15
|
+
from azure.ai.evaluation._legacy._batch_engine._result import TokenMetrics
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
_token_metrics: ContextVar[TokenMetrics] = ContextVar("token_metrics", default=TokenMetrics(0, 0, 0))
|
|
19
|
+
KEY_ATTR_ORIGINAL: Final[str] = "_original"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class _TokenMetrics(Protocol):
|
|
23
|
+
"""Protocol class to represent the token metrics."""
|
|
24
|
+
|
|
25
|
+
prompt_tokens: int
|
|
26
|
+
completion_tokens: int
|
|
27
|
+
total_tokens: int
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _WithUsage(Protocol):
|
|
31
|
+
"""Protocol class to represent an OpenAI object that may have a token usage property/attribute."""
|
|
32
|
+
|
|
33
|
+
usage: Optional[_TokenMetrics]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _wrap_openai_api_method(method: Callable, is_async: bool) -> Callable:
|
|
37
|
+
"""Wraps the OpenAI API method to inject logic to run on the result of the call."""
|
|
38
|
+
|
|
39
|
+
def update_usage(result: _WithUsage) -> None:
|
|
40
|
+
if hasattr(result, "usage") and result.usage is not None:
|
|
41
|
+
usage = _token_metrics.get()
|
|
42
|
+
usage.prompt_tokens += result.usage.prompt_tokens
|
|
43
|
+
usage.completion_tokens += result.usage.completion_tokens
|
|
44
|
+
usage.total_tokens += result.usage.total_tokens
|
|
45
|
+
|
|
46
|
+
if is_async:
|
|
47
|
+
|
|
48
|
+
@functools.wraps(method)
|
|
49
|
+
async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
50
|
+
result: _WithUsage = await method(*args, **kwargs)
|
|
51
|
+
update_usage(result)
|
|
52
|
+
return result
|
|
53
|
+
|
|
54
|
+
return async_wrapper
|
|
55
|
+
else:
|
|
56
|
+
|
|
57
|
+
@functools.wraps(method)
|
|
58
|
+
def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
59
|
+
result: _WithUsage = method(*args, **kwargs)
|
|
60
|
+
update_usage(result)
|
|
61
|
+
return result
|
|
62
|
+
|
|
63
|
+
return sync_wrapper
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _openai_api_list() -> Generator[Tuple[Any, Callable, bool], None, None]:
|
|
67
|
+
"""Load the list of OpenAI API classes and their corresponding method names."""
|
|
68
|
+
|
|
69
|
+
apis: Sequence[Tuple[str, str, str, bool]] = [
|
|
70
|
+
("openai.resources.chat", "Completions", "create", False),
|
|
71
|
+
("openai.resources.chat", "AsyncCompletions", "create", True),
|
|
72
|
+
("openai.resources", "Completions", "create", False),
|
|
73
|
+
("openai.resources", "AsyncCompletions", "create", True),
|
|
74
|
+
("openai.resources", "Embeddings", "create", False),
|
|
75
|
+
("openai.resources", "AsyncEmbeddings", "create", True),
|
|
76
|
+
("openai.resources", "Responses", "create", False),
|
|
77
|
+
("openai.resources", "AsyncResponses", "create", True),
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
for module_name, class_name, method_name, is_async in apis:
|
|
81
|
+
try:
|
|
82
|
+
module = importlib.import_module(module_name)
|
|
83
|
+
cls = getattr(module, class_name, None)
|
|
84
|
+
if cls is None:
|
|
85
|
+
continue
|
|
86
|
+
method = getattr(cls, method_name, None)
|
|
87
|
+
if method is None:
|
|
88
|
+
continue
|
|
89
|
+
yield cls, method, is_async
|
|
90
|
+
except ImportError:
|
|
91
|
+
raise MissingRequiredPackage("Please install the 'openai' package to use the Azure AI Evaluation SDK")
|
|
92
|
+
except AttributeError:
|
|
93
|
+
logging.warning("The module '%s' does not have class '%s' or method '%s'", module_name, class_name, method_name)
|
|
94
|
+
|
|
7
95
|
|
|
8
96
|
def inject_openai_api():
|
|
9
|
-
"""This function
|
|
10
|
-
|
|
11
|
-
It stores the original methods as _original attributes of the create methods.
|
|
12
|
-
2. Updates the openai api configs from environment variables.
|
|
97
|
+
"""This function modifies the create methods of the OpenAI API classes to inject logic
|
|
98
|
+
to enable us to collect token usage data.
|
|
13
99
|
"""
|
|
14
|
-
|
|
15
|
-
|
|
100
|
+
for cls, method, is_async in _openai_api_list():
|
|
101
|
+
# Check if the create method of the openai_api class has already been modified
|
|
102
|
+
if not hasattr(method, KEY_ATTR_ORIGINAL):
|
|
103
|
+
wrapper_method: Callable = _wrap_openai_api_method(method, is_async)
|
|
104
|
+
setattr(wrapper_method, KEY_ATTR_ORIGINAL, method)
|
|
105
|
+
setattr(cls, method.__name__, wrapper_method)
|
|
16
106
|
|
|
17
107
|
|
|
18
108
|
def recover_openai_api():
|
|
19
109
|
"""This function restores the original create methods of the OpenAI API classes
|
|
20
110
|
by assigning them back from the _original attributes of the modified methods.
|
|
21
111
|
"""
|
|
22
|
-
|
|
23
|
-
|
|
112
|
+
for cls, method, _ in _openai_api_list():
|
|
113
|
+
if hasattr(method, KEY_ATTR_ORIGINAL):
|
|
114
|
+
original_method = getattr(method, KEY_ATTR_ORIGINAL)
|
|
115
|
+
setattr(cls, method.__name__, original_method)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class CaptureOpenAITokenUsage:
|
|
119
|
+
"""Context manager to capture OpenAI token usage."""
|
|
120
|
+
def __init__(self):
|
|
121
|
+
self._tokens = TokenMetrics(0, 0, 0)
|
|
122
|
+
|
|
123
|
+
def __enter__(self) -> TokenMetrics:
|
|
124
|
+
_token_metrics.set(TokenMetrics(0, 0, 0))
|
|
125
|
+
return self._tokens
|
|
126
|
+
|
|
127
|
+
def __exit__(self, exc_type: Optional[Exception], exc_value: Optional[Exception], traceback: Optional[Any]) -> None:
|
|
128
|
+
captured_metrics = _token_metrics.get()
|
|
129
|
+
self._tokens.update(captured_metrics)
|
|
@@ -20,6 +20,12 @@ class TokenMetrics:
|
|
|
20
20
|
total_tokens: int
|
|
21
21
|
"""The total number of tokens used in the run."""
|
|
22
22
|
|
|
23
|
+
def update(self, other: "TokenMetrics") -> None:
|
|
24
|
+
"""Update the token metrics with another set of token metrics."""
|
|
25
|
+
self.prompt_tokens += other.prompt_tokens
|
|
26
|
+
self.completion_tokens += other.completion_tokens
|
|
27
|
+
self.total_tokens += other.total_tokens
|
|
28
|
+
|
|
23
29
|
|
|
24
30
|
@dataclass
|
|
25
31
|
class BatchRunError:
|
|
@@ -96,4 +102,4 @@ class BatchResult:
|
|
|
96
102
|
"""The results of the batch run."""
|
|
97
103
|
if not self.details:
|
|
98
104
|
return []
|
|
99
|
-
return [d.result for d in self.details]
|
|
105
|
+
return [d.result for d in self.details]
|
|
@@ -60,6 +60,7 @@ class Run:
|
|
|
60
60
|
inputs: Sequence[Mapping[str, Any]],
|
|
61
61
|
column_mapping: Mapping[str, str],
|
|
62
62
|
created_on: Optional[datetime] = None,
|
|
63
|
+
run: Optional["Run"] = None,
|
|
63
64
|
):
|
|
64
65
|
self._status: RunStatus = RunStatus.NOT_STARTED
|
|
65
66
|
self._created_on = created_on or datetime.now(timezone.utc)
|
|
@@ -72,6 +73,7 @@ class Run:
|
|
|
72
73
|
self.column_mapping = column_mapping
|
|
73
74
|
self.result: Optional[BatchResult] = None
|
|
74
75
|
self.metrics: Mapping[str, Any] = {}
|
|
76
|
+
self._run = run
|
|
75
77
|
|
|
76
78
|
# self._use_remote_flow = False
|
|
77
79
|
# self._from_flex_flow = True
|
|
@@ -105,6 +107,10 @@ class Run:
|
|
|
105
107
|
|
|
106
108
|
return [value or {} for value in self.result.results]
|
|
107
109
|
|
|
110
|
+
@property
|
|
111
|
+
def previous_run(self) -> Optional["Run"]:
|
|
112
|
+
return self._run
|
|
113
|
+
|
|
108
114
|
@staticmethod
|
|
109
115
|
def _generate_run_name(name_prefix: Optional[str], creation_time: datetime) -> str:
|
|
110
116
|
# The Promptflow code looked at the folder name of the temporary folder used to
|
|
@@ -3,17 +3,20 @@
|
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
5
|
import dataclasses
|
|
6
|
+
import inspect
|
|
6
7
|
import sys
|
|
8
|
+
|
|
9
|
+
from concurrent.futures import Executor
|
|
7
10
|
from datetime import datetime, timezone
|
|
8
11
|
from typing import Any, Callable, Dict, Mapping, Optional, Sequence, TextIO, Union
|
|
9
12
|
|
|
10
13
|
from ._run import Run, RunStatus
|
|
11
|
-
from ._trace import start_trace
|
|
14
|
+
from ._trace import start_trace
|
|
12
15
|
from ._run_storage import AbstractRunStorage, NoOpRunStorage
|
|
13
|
-
from ._logging import incremental_print, print_red_error
|
|
16
|
+
from .._common._logging import incremental_print, print_red_error
|
|
14
17
|
from ._config import BatchEngineConfig
|
|
15
18
|
from ._exceptions import BatchEngineValidationError
|
|
16
|
-
from ._engine import BatchEngine, BatchEngineError, BatchResult
|
|
19
|
+
from ._engine import DEFAULTS_KEY, BatchEngine, BatchEngineError, BatchResult
|
|
17
20
|
|
|
18
21
|
|
|
19
22
|
class RunSubmitter:
|
|
@@ -22,25 +25,32 @@ class RunSubmitter:
|
|
|
22
25
|
|
|
23
26
|
THIS WILL BE REMOVED IN A FUTURE CODE UPDATE"""
|
|
24
27
|
|
|
25
|
-
def __init__(self, config: BatchEngineConfig):
|
|
28
|
+
def __init__(self, config: BatchEngineConfig, executor: Optional[Executor] = None):
|
|
26
29
|
# self._client = PFClient instance
|
|
27
30
|
# self._config = PFClient config
|
|
28
31
|
# self.run_operations = RunOperations instance
|
|
29
32
|
|
|
30
33
|
# TODO ralphe: Use proper logger here. Old code did LoggerFactory.get_logger(__name__)
|
|
31
34
|
self._config = config
|
|
35
|
+
self._executor = executor
|
|
32
36
|
|
|
33
|
-
def submit(
|
|
37
|
+
async def submit(
|
|
34
38
|
self,
|
|
35
39
|
dynamic_callable: Callable,
|
|
36
40
|
inputs: Sequence[Mapping[str, Any]],
|
|
37
|
-
column_mapping: Mapping[str, str],
|
|
41
|
+
column_mapping: Optional[Mapping[str, str]],
|
|
38
42
|
*,
|
|
39
43
|
name_prefix: Optional[str] = None,
|
|
40
44
|
created_on: Optional[datetime] = None,
|
|
41
45
|
storage_creator: Optional[Callable[[Run], AbstractRunStorage]] = None,
|
|
42
46
|
**kwargs,
|
|
43
47
|
) -> Run:
|
|
48
|
+
|
|
49
|
+
# if the column mappings are not provided, generate them based on the arguments to the
|
|
50
|
+
# flow function.
|
|
51
|
+
if column_mapping is None:
|
|
52
|
+
column_mapping = self._generate_column_mapping(dynamic_callable)
|
|
53
|
+
|
|
44
54
|
# The old code always spun up two threads here using a ThreadPoolExecutor:
|
|
45
55
|
# 1. One thread essentially did nothing of value (since tracing was disabled, and we
|
|
46
56
|
# don't care about checking for the latest PromptFlow version number now)
|
|
@@ -51,27 +61,18 @@ class RunSubmitter:
|
|
|
51
61
|
# of the _run_bulk code here directly.
|
|
52
62
|
# In a future code refactor, all of this will be cleaned up in favour of proper
|
|
53
63
|
# async/await code.
|
|
54
|
-
|
|
64
|
+
|
|
65
|
+
run: Run = Run(
|
|
55
66
|
dynamic_callable=dynamic_callable,
|
|
56
67
|
name_prefix=name_prefix,
|
|
57
68
|
inputs=inputs,
|
|
58
69
|
column_mapping=column_mapping,
|
|
59
70
|
created_on=created_on,
|
|
71
|
+
run=kwargs.pop("run", None),
|
|
60
72
|
)
|
|
61
73
|
|
|
62
|
-
logger = self._config.logger
|
|
63
74
|
attributes: Dict[str, Any] = kwargs.get("attributes", {})
|
|
64
|
-
collection_for_run:
|
|
65
|
-
|
|
66
|
-
logger.debug("start trace for flow run...")
|
|
67
|
-
logger.debug("flow path for run.start_trace: %s", run.name)
|
|
68
|
-
|
|
69
|
-
if is_collection_writeable():
|
|
70
|
-
logger.debug("trace collection is writeable, will use flow name as collection...")
|
|
71
|
-
collection_for_run = run.name
|
|
72
|
-
logger.debug("collection for run: %s", collection_for_run)
|
|
73
|
-
else:
|
|
74
|
-
logger.debug("trace collection is protected, will honor existing collection.")
|
|
75
|
+
collection_for_run: str = run.name
|
|
75
76
|
start_trace(attributes=attributes, run=run, _collection=collection_for_run)
|
|
76
77
|
|
|
77
78
|
self._validate_inputs(run=run)
|
|
@@ -81,12 +82,12 @@ class RunSubmitter:
|
|
|
81
82
|
run._status = RunStatus.PREPARING
|
|
82
83
|
|
|
83
84
|
# unnecessary Flow loading code was removed here. Instead do direct calls to _submit_bulk_run
|
|
84
|
-
self._submit_bulk_run(run=run, local_storage=local_storage, **kwargs)
|
|
85
|
+
await self._submit_bulk_run(run=run, local_storage=local_storage, **kwargs)
|
|
85
86
|
|
|
86
87
|
self.stream_run(run=run, storage=local_storage, raise_on_error=True)
|
|
87
88
|
return run
|
|
88
89
|
|
|
89
|
-
def _submit_bulk_run(self, run: Run, local_storage: AbstractRunStorage, **kwargs) -> None:
|
|
90
|
+
async def _submit_bulk_run(self, run: Run, local_storage: AbstractRunStorage, **kwargs) -> None:
|
|
90
91
|
logger = self._config.logger
|
|
91
92
|
|
|
92
93
|
logger.info(f"Submitting run {run.name}, log path: {local_storage.logger.file_path}")
|
|
@@ -95,6 +96,29 @@ class RunSubmitter:
|
|
|
95
96
|
# removed since it is unnecessary. It also parsed and set environment variables. This
|
|
96
97
|
# has also been removed since it can be problematic in a multi-threaded environment.
|
|
97
98
|
|
|
99
|
+
if run.previous_run:
|
|
100
|
+
previous: Optional[Run] = run.previous_run
|
|
101
|
+
if previous.status != RunStatus.COMPLETED:
|
|
102
|
+
raise BatchEngineValidationError(
|
|
103
|
+
f"Referenced run {previous.name} is not completed, got status {previous.status.value}."
|
|
104
|
+
)
|
|
105
|
+
if previous.outputs is not None:
|
|
106
|
+
if len(previous.outputs) != len(run.inputs):
|
|
107
|
+
raise BatchEngineValidationError(
|
|
108
|
+
f"Referenced run {previous.name} has {len(previous.outputs)} outputs, "
|
|
109
|
+
f"but {len(run.inputs)} inputs are provided."
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
# load in the previous run's outputs and inputs into the list of dictionaries to allow for
|
|
113
|
+
# the previous run's outputs to be used as inputs for the current run
|
|
114
|
+
run.inputs = [
|
|
115
|
+
{
|
|
116
|
+
"run.outputs": previous.outputs[i],
|
|
117
|
+
"run.inputs": previous.inputs[i],
|
|
118
|
+
**run.inputs[i]
|
|
119
|
+
}
|
|
120
|
+
for i in range(len(run.inputs))]
|
|
121
|
+
|
|
98
122
|
self._validate_column_mapping(run.column_mapping)
|
|
99
123
|
|
|
100
124
|
run._status = RunStatus.RUNNING
|
|
@@ -108,10 +132,10 @@ class RunSubmitter:
|
|
|
108
132
|
batch_timeout_sec=self._config.batch_timeout_seconds,
|
|
109
133
|
line_timeout_sec=self._config.run_timeout_seconds,
|
|
110
134
|
max_worker_count=self._config.max_concurrency,
|
|
111
|
-
|
|
135
|
+
executor=self._executor,
|
|
112
136
|
)
|
|
113
137
|
|
|
114
|
-
batch_result = batch_engine.run(data=run.inputs, column_mapping=run.column_mapping, id=run.name)
|
|
138
|
+
batch_result = await batch_engine.run(data=run.inputs, column_mapping=run.column_mapping, id=run.name)
|
|
115
139
|
run._status = RunStatus.from_batch_result_status(batch_result.status)
|
|
116
140
|
|
|
117
141
|
error_logs: Sequence[str] = []
|
|
@@ -152,10 +176,30 @@ class RunSubmitter:
|
|
|
152
176
|
run.metrics = system_metrics
|
|
153
177
|
run.result = batch_result
|
|
154
178
|
|
|
179
|
+
@staticmethod
|
|
180
|
+
def _generate_column_mapping(function: Callable) -> Mapping[str, Any]:
|
|
181
|
+
args = inspect.signature(function).parameters
|
|
182
|
+
default_values: Dict[str, Any] = {}
|
|
183
|
+
mapping: Dict[str, Any] = {}
|
|
184
|
+
for key, value in args.items():
|
|
185
|
+
if key in ["self", "cls"] or value.kind in [value.VAR_POSITIONAL, value.VAR_KEYWORD]:
|
|
186
|
+
continue
|
|
187
|
+
|
|
188
|
+
mapping[key] = f"${{data.{key}}}"
|
|
189
|
+
if value.default != inspect.Parameter.empty:
|
|
190
|
+
default_values[key] = value.default
|
|
191
|
+
|
|
192
|
+
return {
|
|
193
|
+
**mapping,
|
|
194
|
+
DEFAULTS_KEY: default_values,
|
|
195
|
+
}
|
|
196
|
+
|
|
155
197
|
@staticmethod
|
|
156
198
|
def _validate_inputs(run: Run):
|
|
157
|
-
if not run.inputs:
|
|
158
|
-
raise BatchEngineValidationError(
|
|
199
|
+
if not run.inputs and not run.previous_run:
|
|
200
|
+
raise BatchEngineValidationError(
|
|
201
|
+
"Either data, or a previous run must be specified for the evaluation run."
|
|
202
|
+
)
|
|
159
203
|
|
|
160
204
|
@staticmethod
|
|
161
205
|
def _validate_column_mapping(column_mapping: Mapping[str, str]):
|
|
@@ -178,10 +222,6 @@ class RunSubmitter:
|
|
|
178
222
|
:param AbstractRunStorage storage: The storage to use for the output.
|
|
179
223
|
"""
|
|
180
224
|
|
|
181
|
-
# TODO ralphe: This doesn't seem to be do anything useful beyond just print
|
|
182
|
-
# a run summary at the end. This is because by the time it gets
|
|
183
|
-
# invoked even in the original code, the run has already completed.
|
|
184
|
-
|
|
185
225
|
if run is None or storage is None:
|
|
186
226
|
return
|
|
187
227
|
|