azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +201 -16
- azure/ai/evaluation/_constants.py +12 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
- azure/ai/evaluation/_exceptions.py +9 -7
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +37 -9
- azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +127 -117
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py

@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
+import types
+from typing import Optional, Type, Union
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
@@ -30,12 +32,12 @@ class BatchRunContext:
     ]
     """
 
-    def __init__(self, client) -> None:
+    def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
 
-    def __enter__(self):
+    def __enter__(self) -> None:
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -56,7 +58,12 @@ class BatchRunContext:
         # For addressing the issue of asyncio event loop closed on Windows
         set_event_loop_policy()
 
-    def __exit__(
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> None:
         if isinstance(self.client, CodeClient):
             recover_openai_api()
 
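These batch_run_context.py hunks are mostly typing work: the client parameter gains a Union annotation and the context-manager protocol methods get explicit signatures. For reference, here is a minimal, self-contained sketch of the typed context-manager protocol being adopted; the ManagedScope class is illustrative and not part of the package.

import types
from typing import Optional, Type


class ManagedScope:
    """Illustrative context manager using the same typed protocol as BatchRunContext."""

    def __enter__(self) -> None:
        # Returning None means `with ManagedScope() as x` binds x to None,
        # matching how BatchRunContext.__enter__ is annotated above.
        print("setting up")

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        # A falsy return value lets any in-flight exception propagate to the caller.
        print("tearing down")


with ManagedScope():
    print("doing work")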
azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py

@@ -5,8 +5,9 @@ import inspect
 import json
 import logging
 import os
+from concurrent.futures import Future
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
 import pandas as pd
 from promptflow.contracts.types import AttrDict
@@ -22,25 +23,31 @@ LOGGER = logging.getLogger(__name__)
 
 class CodeRun:
     def __init__(
-        self,
-
+        self,
+        *,
+        run: Future,
+        input_data,
+        evaluator_name: Optional[str] = None,
+        aggregator: Callable[["CodeRun"], Future],
+        **kwargs,  # pylint: disable=unused-argument
+    ) -> None:
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
-        self.aggregated_metrics =
+        self.aggregated_metrics = aggregator(self)
 
-    def get_result_df(self, exclude_inputs=False):
+    def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-        result_df = self.run.result(timeout=batch_run_timeout)
+        result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
         if exclude_inputs:
             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         return result_df
 
-    def get_aggregated_metrics(self):
+    def get_aggregated_metrics(self) -> Dict[str, Any]:
         try:
             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-            aggregated_metrics = (
-                self.aggregated_metrics.result(timeout=batch_run_timeout)
+            aggregated_metrics: Optional[Any] = (
+                cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                 if self.aggregated_metrics is not None
                 else None
             )
@@ -104,10 +111,10 @@ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
             verify_integrity=True,
         )
 
-
+    @staticmethod
+    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
         try:
             if _has_aggregator(evaluator):
-                aggregate_input = None
                 evaluator_output = run.get_result_df(exclude_inputs=True)
                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                     aggregate_input = evaluator_output["output"].tolist()
@@ -152,21 +159,30 @@ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
             column_mapping=column_mapping,
             evaluator_name=evaluator_name,
         )
-
-
-
-
+
+        return CodeRun(
+            run=eval_future,
+            input_data=data,
+            evaluator_name=evaluator_name,
+            aggregator=lambda code_run: self._thread_pool.submit(
+                self._calculate_aggregations, evaluator=flow, run=code_run
+            ),
+        )
 
     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self, run: CodeRun) ->
+    def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
-            return
+            return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
+        # Not implemented
+        return None
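The CodeRun changes above swap a plain attribute assignment for an aggregator callable supplied at construction time: the new run passes itself to the callable, which submits the aggregation to CodeClient's thread pool and returns a Future that get_aggregated_metrics can later wait on with a timeout. A rough standalone sketch of that pattern follows, using invented Job and slow_sum names purely for illustration.

from concurrent.futures import Future, ThreadPoolExecutor
from typing import Callable, List


class Job:
    """Illustrative stand-in for CodeRun: aggregation is scheduled as soon as the job is built."""

    def __init__(self, *, data: List[int], aggregator: Callable[["Job"], Future]) -> None:
        self.data = data
        # The constructor hands itself to the aggregator, which schedules the work
        # on a pool and returns a Future to wait on later.
        self.aggregated = aggregator(self)


def slow_sum(job: Job) -> int:
    return sum(job.data)


pool = ThreadPoolExecutor(max_workers=1)
job = Job(data=[1, 2, 3], aggregator=lambda j: pool.submit(slow_sum, j))
print(job.aggregated.result(timeout=10))  # 6
pool.shutdown()

Keeping the result behind a Future is what lets the typed get_aggregated_metrics above apply PF_BATCH_TIMEOUT_SEC when it finally resolves the value.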
azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

@@ -3,11 +3,12 @@
 # ---------------------------------------------------------
 import inspect
 import logging
+import math
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
+from collections import OrderedDict
 
-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -53,13 +54,27 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)",
+        result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
+    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run = proxy_run.run.result()
+
+        # pylint: disable=protected-access
+        return OrderedDict(
+            [
+                ("status", run.status),
+                ("duration", str(run._end_time - run._created_on)),
+                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("log_path", str(run._output_path)),
+            ]
+        )
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
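The new ProxyClient.get_run_summary assembles status, duration, completed and failed line counts, and a log path into an OrderedDict so the summary keeps a fixed key order when printed. The sketch below reproduces only that assembly step with invented stand-in values; the real method reads these fields from a promptflow Run object's private attributes.

from collections import OrderedDict
from datetime import datetime

# Invented stand-in values for illustration.
status = "Completed"
created_on = datetime(2024, 9, 1, 12, 0, 0)
end_time = datetime(2024, 9, 1, 12, 3, 30)
system_metrics = {"__pf__.lines.completed": 98, "__pf__.lines.failed": 2}

summary = OrderedDict(
    [
        ("status", status),
        ("duration", str(end_time - created_on)),
        ("completed_lines", system_metrics.get("__pf__.lines.completed", "NA")),
        ("failed_lines", system_metrics.get("__pf__.lines.failed", "NA")),
    ]
)
print(summary)  # keys print in the insertion order shown above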
azure/ai/evaluation/_evaluate/_eval_run.py

@@ -10,10 +10,11 @@ import posixpath
 import time
 import types
 import uuid
-from typing import Any, Dict, Optional, Set, Type
+from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
 from promptflow._sdk.entities import Run
+from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_http_client
@@ -27,6 +28,7 @@ LOGGER = logging.getLogger(__name__)
 # Handle optional import. The azure libraries are only present if
 # promptflow-azure is installed.
 try:
+    from azure.ai.ml import MLClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
@@ -121,8 +123,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         self._run_name = run_name
         self._promptflow_run = promptflow_run
         self._status = RunStatus.NOT_STARTED
-        self._url_base = None
-        self.
+        self._url_base: Optional[str] = None
+        self._info: Optional[RunInfo] = None
 
     @property
     def status(self) -> RunStatus:
@@ -134,6 +136,20 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         return self._status
 
+    @property
+    def info(self) -> RunInfo:
+        if self._info is None:
+            msg = "Run info is missing"
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVAL_RUN,
+                category=ErrorCategory.UNKNOWN,
+                blame=ErrorBlame.UNKNOWN,
+            )
+
+        return self._info
+
     def _get_scope(self) -> str:
         """
         Return the scope information for the workspace.
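The info property added above guards an optional internal field: _info starts as None, is filled in once a run is actually started, and reading it too early now raises a descriptive EvaluationException instead of silently handing back None. A generic, self-contained sketch of the same pattern, with illustrative Tracker and RunInfo names and a plain RuntimeError standing in for the SDK's exception type:

from dataclasses import dataclass
from typing import Optional


@dataclass
class RunInfo:
    run_id: str


class Tracker:
    """Illustrative stand-in for EvalRun."""

    def __init__(self) -> None:
        # Populated later, once the run has actually been started.
        self._info: Optional[RunInfo] = None

    @property
    def info(self) -> RunInfo:
        # Fail loudly rather than letting None leak into callers.
        if self._info is None:
            raise RuntimeError("Run info is missing; start the run first")
        return self._info

    def start(self) -> None:
        self._info = RunInfo(run_id="local-run-123")


tracker = Tracker()
tracker.start()
print(tracker.info.run_id)  # local-run-123

The remaining _eval_run.py hunks below show the call sites that now populate self._info.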
azure/ai/evaluation/_evaluate/_eval_run.py (continued)

@@ -161,11 +177,11 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             )
             self._url_base = None
             self._status = RunStatus.BROKEN
-            self.
+            self._info = RunInfo.generate(self._run_name)
         else:
             self._url_base = urlparse(self._tracking_uri).netloc
             if self._promptflow_run is not None:
-                self.
+                self._info = RunInfo(
                     self._promptflow_run.name,
                     self._promptflow_run._experiment_name,  # pylint: disable=protected-access
                     self._promptflow_run.name,
@@ -182,7 +198,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
                 body["run_name"] = self._run_name
                 response = self.request_with_retry(url=url, method="POST", json_dict=body)
                 if response.status_code != 200:
-                    self.
+                    self._info = RunInfo.generate(self._run_name)
                     LOGGER.warning(
                         "The run failed to start: %s: %s."
                         "The results will be saved locally, but will not be logged to Azure.",
@@ -192,7 +208,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
                     self._status = RunStatus.BROKEN
                 else:
                     parsed_response = response.json()
-                    self.
+                    self._info = RunInfo(
                         run_id=parsed_response["run"]["info"]["run_id"],
                         experiment_id=parsed_response["run"]["info"]["experiment_id"],
                         run_name=parsed_response["run"]["info"]["run_name"],
@@ -235,7 +251,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             LOGGER.warning("Unable to terminate the run.")
         self._status = RunStatus.TERMINATED
 
-    def __enter__(self):
+    def __enter__(self) -> Self:
         """The Context Manager enter call.
 
         :return: The instance of the class.
@@ -249,7 +265,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         exc_type: Optional[Type[BaseException]],
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
-    ) ->
+    ) -> None:
         """The context manager exit call.
 
         :param exc_type: The exception type
@@ -408,7 +424,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             return
         # First we will list the files and the appropriate remote paths for them.
         root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
-        remote_paths = {"paths": []}
+        remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
         for root, _, filenames in os.walk(artifact_folder):