azure-ai-evaluation 1.0.0b1__py3-none-any.whl → 1.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +1 -5
- azure/ai/evaluation/_common/rai_service.py +4 -4
- azure/ai/evaluation/_common/utils.py +19 -19
- azure/ai/evaluation/_constants.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
- azure/ai/evaluation/_evaluate/_evaluate.py +35 -28
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
- azure/ai/evaluation/_evaluate/_utils.py +29 -22
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_chat/_chat.py +16 -9
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +4 -10
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -10
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +1 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +1 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +2 -2
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +5 -10
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +5 -10
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +2 -2
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +2 -2
- azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +5 -10
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -10
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +1 -2
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
- azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/{simulator.py → _simulator.py} +147 -80
- azure/ai/evaluation/simulator/_tracing.py +21 -24
- azure/ai/evaluation/simulator/_utils.py +4 -1
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/METADATA +86 -14
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/RECORD +58 -56
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,21 +6,24 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict
+from typing import Callable, Dict, TypeVar

 import pandas as pd
-
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
 from promptflow._sdk.entities._flows import Prompty as prompty_sdk
 from promptflow._sdk.entities._flows.dag import Flow as dag_flow
 from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
+from typing_extensions import ParamSpec

 from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)

+P = ParamSpec("P")
+R = TypeVar("R")
+

 def _get_evaluator_type(evaluator: Dict[str, Callable]):
     """

@@ -82,7 +85,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         name = str(evaluator)
         pf_type = "Unknown"
     except Exception as e:  # pylint: disable=broad-exception-caught
-        LOGGER.debug(f"Failed to get evaluator properties: {e}")
+        LOGGER.debug("Failed to get evaluator properties: %s", e)
         name = str(evaluator)
         pf_type = "Unknown"

@@ -95,15 +98,17 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func) -> None:
+def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
+    :returns: The decorated function
+    :rtype: Callable[P, R]
     """

     @functools.wraps(func)
-    def wrapper(*args, **kwargs) -> None:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

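The hunk above is the standard `ParamSpec` decorator-typing pattern: annotating the decorator as `Callable[P, R] -> Callable[P, R]` lets type checkers see through the wrapper instead of treating the decorated function as returning `None`. A minimal standalone sketch of the same pattern (not the SDK's code; `logged` and `add` are made up for illustration):

import functools
from typing import Callable, TypeVar
from typing_extensions import ParamSpec

P = ParamSpec("P")
R = TypeVar("R")

def logged(func: Callable[P, R]) -> Callable[P, R]:
    """Decorator that preserves the wrapped function's signature for type checkers."""
    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        print(f"calling {func.__name__}")
        return func(*args, **kwargs)
    return wrapper

@logged
def add(x: int, y: int) -> int:
    return x + y

result: int = add(2, 3)  # mypy/pyright now check arguments and infer the return type
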
@@ -119,7 +124,7 @@ def log_evaluate_activity(func) -> None:
                 user_agent=USER_AGENT,
             )

-            track_in_cloud = bool(pf_client._config.get_trace_destination())
+            track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
             evaluate_target = bool(kwargs.get("target", None))
             evaluator_config = bool(kwargs.get("evaluator_config", None))
             custom_dimensions = {

@@ -154,7 +159,7 @@ def log_evaluate_activity(func) -> None:
                     evaluator_info["failed_rows"] = failed_rows
                     evaluator_info["total_rows"] = total_rows
                 except Exception as e:  # pylint: disable=broad-exception-caught
-                    LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
+                    LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
                 evaluators_info.append(evaluator_info)

             custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}

@@ -167,7 +172,7 @@ def log_evaluate_activity(func) -> None:
             ):
                 pass
             except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug(f"Failed to collect evaluate usage info: {e}")
+                LOGGER.debug("Failed to collect evaluate usage info: %s", e)

             return result

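The logging hunks above (and the matching ones in the evaluator files below) all make the same change: f-string log calls become %-style lazy formatting, so the message is only interpolated when a handler actually emits the record, which also satisfies pylint's logging-fstring-interpolation check. A self-contained illustration (assumed names, not the SDK's code):

import logging

logging.basicConfig(level=logging.DEBUG)
LOGGER = logging.getLogger(__name__)

def risky_operation():
    raise ValueError("boom")

try:
    risky_operation()
except Exception as e:  # pylint: disable=broad-exception-caught
    # Lazy %-formatting: the string is built only if DEBUG records are emitted;
    # an f-string argument would be built on every pass through this handler.
    LOGGER.debug("Failed to collect evaluate usage info: %s", e)
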
azure/ai/evaluation/_evaluate/_utils.py

@@ -8,12 +8,13 @@ import re
 import tempfile
 from collections import namedtuple
 from pathlib import Path
+from typing import Dict

 import pandas as pd

-from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, Prefixes
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
 from azure.ai.evaluation._evaluate._eval_run import EvalRun
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 LOGGER = logging.getLogger(__name__)

@@ -33,16 +34,16 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint:
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
-            [... ten removed argument lines not preserved in this rendering ...]
+            message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            f"workspaces/<workspace_name>, got {trace_provider}",
+            internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            "workspaces/<workspace_name>,",
+            target=ErrorTarget.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.UNKNOWN,
+        )
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
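The validation above expects a trace provider of the form `azureml://subscriptions/<subscription_id>/resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/workspaces/<workspace_name>` and pulls the workspace triad out of match groups 1, 3, and 5. The SDK's actual `AZURE_WORKSPACE_REGEX_FORMAT` is not visible in this diff; the pattern below is an illustrative stand-in that recovers the same three values:

import re

# Illustrative pattern only; the real AZURE_WORKSPACE_REGEX_FORMAT (five groups,
# of which 1, 3, and 5 are consumed) is defined elsewhere in the package.
TRACE_PROVIDER_RE = re.compile(
    r"azureml://subscriptions/([^/]+)/resourceGroups/([^/]+)/"
    r"providers/Microsoft\.MachineLearningServices/workspaces/([^/]+)"
)

def parse_trace_provider(trace_provider: str):
    match = TRACE_PROVIDER_RE.match(trace_provider)
    if not match:
        raise ValueError(f"Malformed trace provider string, got {trace_provider}")
    # subscription_id, resource_group_name, workspace_name
    return match.group(1), match.group(2), match.group(3)

print(parse_trace_provider(
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000/"
    "resourceGroups/my-rg/providers/Microsoft.MachineLearningServices/workspaces/my-ws"
))
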
@@ -50,7 +51,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint:


 def load_jsonl(path):
-    with open(path, "r", encoding="utf-8") as f:
+    with open(path, "r", encoding=DefaultOpenEncoding.READ) as f:
         return [json.loads(line) for line in f.readlines()]


@@ -99,7 +100,7 @@ def _log_metrics_and_instance_results(
     with tempfile.TemporaryDirectory() as tmpdir:
         tmp_path = os.path.join(tmpdir, artifact_name)

-        with open(tmp_path, "w", encoding="utf-8") as f:
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
             f.write(instance_results.to_json(orient="records", lines=True))

         ev_run.log_artifact(tmpdir, artifact_name)
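`DefaultOpenEncoding` comes from the `+9 -0` change to `azure/ai/evaluation/_constants.py`, whose body is not part of this diff. A plausible shape, assuming it simply centralizes the encodings passed to `open()` (the member values here are guesses; `"utf-8-sig"` on read would tolerate a BOM):

class DefaultOpenEncoding:
    """Hypothetical sketch: the real definition lives in _constants.py
    and is not shown in this diff."""
    READ = "utf-8-sig"
    WRITE = "utf-8"

with open("results.jsonl", "w", encoding=DefaultOpenEncoding.WRITE) as f:
    f.write('{"score": 5}\n')

with open("results.jsonl", "r", encoding=DefaultOpenEncoding.READ) as f:
    print(f.read())

Whatever the exact values, the point of the change is visible in the hunks: every `open()` call now names an encoding explicitly instead of depending on the platform default.
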
@@ -155,11 +156,13 @@ def _write_output(path, data_dict):
     if os.path.isdir(path):
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME

-    with open(p, "w") as f:
+    with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)


-def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False):
+def _apply_column_mapping(
+    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.

@@ -167,10 +170,11 @@ def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace
     :param source_df: the data frame to be changed.
     :type source_df: pd.DataFrame
     :param mapping_config: The configuration, containing column mapping.
-    :type mapping_config: dict
+    :type mapping_config: Dict[str, str].
     :param inplace: If true, the source_df will be changed inplace.
     :type inplace: bool
     :return: The modified data frame.
+    :rtype: pd.DataFrame
     """
     result_df = source_df

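The retyped `mapping_config: Dict[str, str]` maps output column names to references into the source frame; the body of `_apply_column_mapping` is not in this diff, so the exact reference syntax is not visible here. A hypothetical simplification that treats each value as a plain source column name, just to show the shape of the call:

from typing import Dict

import pandas as pd

def apply_column_mapping(
    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
) -> pd.DataFrame:
    """Hypothetical stand-in for _apply_column_mapping: the real helper resolves
    richer column references; here each value is a plain source column name."""
    result_df = source_df if inplace else source_df.copy()
    for new_name, source_name in mapping_config.items():
        result_df[new_name] = result_df[source_name]
    return result_df

df = pd.DataFrame({"question": ["hi"], "answer": ["hello"]})
print(apply_column_mapping(df, {"query": "question", "response": "answer"}))
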
@@ -211,19 +215,22 @@ def _has_aggregator(evaluator):
     return hasattr(evaluator, "__aggregate__")


-def get_int_env_var(env_var_name, default_value):
+def get_int_env_var(env_var_name: str, default_value: int) -> int:
     """
-    The function `get_int_env_var` retrieves an integer environment variable value, with
+    The function `get_int_env_var` retrieves an integer environment variable value, with a
     default value if the variable is not set or cannot be converted to an integer.

     :param env_var_name: The name of the environment variable you want to retrieve the value of
+    :type env_var_name: str
     :param default_value: The default value is the value that will be returned if the environment
-    variable is not found or if it cannot be converted to an integer
+        variable is not found or if it cannot be converted to an integer
+    :type default_value: int
     :return: an integer value.
+    :rtype: int
     """
     try:
-        return int(os.environ.get(env_var_name, default_value))
-    except ValueError:
+        return int(os.environ[env_var_name])
+    except (ValueError, KeyError):
         return default_value

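The new signature makes the helper's contract explicit: index the environment directly, and fall back to the default on either a missing variable (`KeyError`) or a non-integer value (`ValueError`). A standalone equivalent (the variable name is made up):

import os

def get_int_env_var(env_var_name: str, default_value: int) -> int:
    # Direct indexing raises KeyError when the variable is unset;
    # int() raises ValueError when the value is not a number.
    try:
        return int(os.environ[env_var_name])
    except (ValueError, KeyError):
        return default_value

os.environ["EVAL_BATCH_SIZE"] = "25"           # hypothetical variable name
print(get_int_env_var("EVAL_BATCH_SIZE", 10))  # 25
print(get_int_env_var("EVAL_UNSET_VAR", 10))   # 10
os.environ["EVAL_BATCH_SIZE"] = "not-a-number"
print(get_int_env_var("EVAL_BATCH_SIZE", 10))  # 10
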
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.utils import nltk_tokenize

azure/ai/evaluation/_evaluators/_chat/_chat.py

@@ -4,19 +4,18 @@
 import json
 import logging
 from concurrent.futures import as_completed
-from typing import Dict, List
+from typing import Dict, List

 import numpy as np
-
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
 from .._coherence import CoherenceEvaluator
 from .._fluency import FluencyEvaluator
 from .._groundedness import GroundednessEvaluator
-from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from .._relevance import RelevanceEvaluator
 from .retrieval import RetrievalChatEvaluator
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget

 logger = logging.getLogger(__name__)

@@ -203,7 +202,7 @@ class ChatEvaluator:
             return score
         except Exception as e:  # pylint: disable=broad-exception-caught
             logger.warning(
-                f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}"
+                "Evaluator %s failed for turn %s with exception: %s", evaluator.__class__.__name__, turn_num + 1, e
             )
             return {}

@@ -264,7 +263,10 @@ class ChatEvaluator:
             )

             if "role" not in turn or "content" not in turn:
-                msg = f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: {one_based_turn_num}"
+                msg = (
+                    "Each turn in 'conversation' must have 'role' and 'content' keys. "
+                    + f"Turn number: {one_based_turn_num}"
+                )
                 raise EvaluationException(
                     message=msg,
                     internal_message=msg,

@@ -272,7 +274,7 @@ class ChatEvaluator:
                     category=ErrorCategory.INVALID_VALUE,
                     blame=ErrorBlame.USER_ERROR,
                 )
-
+
             if turn["role"] != expected_role:
                 msg = f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}"
                 raise EvaluationException(

@@ -305,7 +307,9 @@ class ChatEvaluator:
             )

             if "citations" not in turn["context"]:
-                msg = f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}"
+                msg = (
+                    f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}"
+                )
                 raise EvaluationException(
                     message=msg,
                     internal_message=msg,

@@ -326,7 +330,10 @@ class ChatEvaluator:

             for citation_num, citation in enumerate(turn["context"]["citations"]):
                 if not isinstance(citation, dict):
-                    msg = f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}"
+                    msg = (
+                        "Each citation in 'citations' must be a dictionary. "
+                        + f"Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}"
+                    )
                     raise EvaluationException(
                         message=msg,
                         internal_message=msg,
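Read together, the validation messages above pin down the conversation format ChatEvaluator accepts: every turn needs `role` and `content`, roles must alternate as expected, and each assistant turn's `context` must carry a `citations` list of dictionaries. A payload that passes those checks (the citation fields `id` and `content` are illustrative; only the keys named in the error messages are guaranteed by this diff):

conversation = [
    {"role": "user", "content": "What is the capital of France?"},
    {
        "role": "assistant",
        "content": "Paris is the capital of France.",
        "context": {
            "citations": [
                {"id": "doc1", "content": "Paris is the capital and largest city of France."}
            ]
        },
    },
]

ContentSafetyChatEvaluator (below) applies the same role/content check to its conversations.
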
azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py

@@ -6,18 +6,12 @@ import json
 import logging
 import os
 import re
-from typing import Union

 import numpy as np
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty

-from ....
-from ...._common.utils import (
-    check_and_add_api_version_for_aoai_model_config,
-    check_and_add_user_agent_for_aoai_model_config,
-)
+from ...._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config

 logger = logging.getLogger(__name__)

@@ -34,7 +28,7 @@ class _AsyncRetrievalChatEvaluator:
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

     def __init__(self, model_config: dict):
-        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)

         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

@@ -42,7 +36,7 @@ class _AsyncRetrievalChatEvaluator:
         # https://github.com/encode/httpx/discussions/2959
         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

-        check_and_add_user_agent_for_aoai_model_config(
+        ensure_user_agent_in_aoai_model_config(
             model_config,
             prompty_model_config,
             USER_AGENT,

@@ -92,7 +86,7 @@ class _AsyncRetrievalChatEvaluator:

         except Exception as e:  # pylint: disable=broad-exception-caught
             logger.warning(
-                f"Evaluator {self.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}"
+                "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
             )

             per_turn_scores.append(np.nan)
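The rename from `check_and_add_*_for_aoai_model_config` to `ensure_api_version_in_aoai_model_config` / `ensure_user_agent_in_aoai_model_config` recurs across all of the prompty-based evaluators below. The helpers' bodies live in `azure/ai/evaluation/_common/utils.py` and are not part of this diff; the sketch below is only a guess at their behavior, consistent with how they are called here (default the API version when the caller did not set one, and attach the SDK's user agent to outgoing headers):

def ensure_api_version_in_aoai_model_config(model_config: dict, default_api_version: str) -> None:
    # Hypothetical sketch; the real helper is not shown in this diff.
    model_config.setdefault("api_version", default_api_version)

def ensure_user_agent_in_aoai_model_config(
    model_config: dict, prompty_model_config: dict, user_agent: str
) -> None:
    # Hypothetical sketch; the header name is an assumption.
    if user_agent:
        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})

model_config = {"azure_endpoint": "https://example.openai.azure.com", "azure_deployment": "gpt-4o"}
prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
ensure_api_version_in_aoai_model_config(model_config, "2024-02-15-preview")
ensure_user_agent_in_aoai_model_config(model_config, prompty_model_config, "azure-ai-evaluation/1.0.0b2")
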
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -4,19 +4,14 @@

 import os
 import re
-from typing import Union

 import numpy as np
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from promptflow.core import AsyncPrompty

-from ..._common.utils import (
-    check_and_add_api_version_for_aoai_model_config,
-    check_and_add_user_agent_for_aoai_model_config,
-)
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config

 try:
     from ..._user_agent import USER_AGENT

@@ -31,7 +26,7 @@ class _AsyncCoherenceEvaluator:
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

     def __init__(self, model_config: dict):
-        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)

         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

@@ -39,7 +34,7 @@ class _AsyncCoherenceEvaluator:
         # https://github.com/encode/httpx/discussions/2959
         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

-        check_and_add_user_agent_for_aoai_model_config(
+        ensure_user_agent_in_aoai_model_config(
             model_config,
             prompty_model_config,
             USER_AGENT,
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -5,8 +5,6 @@ from concurrent.futures import as_completed

 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

-from azure.ai.evaluation._model_configurations import AzureAIProject
-
 try:
     from ._hate_unfairness import HateUnfairnessEvaluator
     from ._self_harm import SelfHarmEvaluator

azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py

@@ -6,8 +6,7 @@ from abc import ABC

 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException


 class ContentSafetyEvaluatorBase(ABC):

azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py

@@ -7,9 +7,8 @@ from typing import Dict, List

 import numpy as np
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget

-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 try:
     from ._hate_unfairness import HateUnfairnessEvaluator

@@ -165,7 +164,10 @@ class ContentSafetyChatEvaluator:
             return score
         except Exception as e:  # pylint: disable=broad-exception-caught
             logger.warning(
-                f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}"
+                "Evaluator %s failed for turn %s with exception: %s",
+                evaluator.__class__.__name__,
+                turn_num + 1,
+                e,
             )
             return {}

@@ -235,7 +237,10 @@ class ContentSafetyChatEvaluator:
             )

             if "role" not in turn or "content" not in turn:
-                msg = f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: {one_based_turn_num}"
+                msg = (
+                    "Each turn in 'conversation' must have 'role' and 'content' keys. "
+                    + f"Turn number: {one_based_turn_num}"
+                )
                 raise EvaluationException(
                     message=msg,
                     internal_message=msg,
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._model_configurations import AzureAIProject

 try:
     from ._content_safety_base import ContentSafetyEvaluatorBase

azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._model_configurations import AzureAIProject

 try:
     from ._content_safety_base import ContentSafetyEvaluatorBase

azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._model_configurations import AzureAIProject

 try:
     from ._content_safety_base import ContentSafetyEvaluatorBase

azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._model_configurations import AzureAIProject

 try:
     from ._content_safety_base import ContentSafetyEvaluatorBase
azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -2,10 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException


 class _AsyncECIEvaluator:

azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -6,7 +6,8 @@ from collections import Counter
 from typing import List

 from promptflow._utils.async_utils import async_run_allowing_running_loop
-
+
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException


 class _AsyncF1ScoreEvaluator:
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -4,19 +4,14 @@

 import os
 import re
-from typing import Union

 import numpy as np
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from promptflow.core import AsyncPrompty

-from ..._common.utils import (
-    check_and_add_api_version_for_aoai_model_config,
-    check_and_add_user_agent_for_aoai_model_config,
-)
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config

 try:
     from ..._user_agent import USER_AGENT

@@ -31,7 +26,7 @@ class _AsyncFluencyEvaluator:
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

     def __init__(self, model_config: dict):
-        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)

         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

@@ -39,7 +34,7 @@ class _AsyncFluencyEvaluator:
         # https://github.com/encode/httpx/discussions/2959
         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

-        check_and_add_user_agent_for_aoai_model_config(
+        ensure_user_agent_in_aoai_model_config(
             model_config,
             prompty_model_config,
             USER_AGENT,
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from nltk.translate.gleu_score import sentence_gleu
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.utils import nltk_tokenize

azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -4,19 +4,14 @@

 import os
 import re
-from typing import Union

 import numpy as np
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from promptflow.core import AsyncPrompty

-from ..._common.utils import (
-    check_and_add_api_version_for_aoai_model_config,
-    check_and_add_user_agent_for_aoai_model_config,
-)
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config

 try:
     from ..._user_agent import USER_AGENT

@@ -31,7 +26,7 @@ class _AsyncGroundednessEvaluator:
    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

     def __init__(self, model_config: dict):
-        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)

         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

@@ -39,7 +34,7 @@ class _AsyncGroundednessEvaluator:
         # https://github.com/encode/httpx/discussions/2959
         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

-        check_and_add_user_agent_for_aoai_model_config(
+        ensure_user_agent_in_aoai_model_config(
             model_config,
             prompty_model_config,
             USER_AGENT,
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py

@@ -2,10 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException


 class _AsyncProtectedMaterialEvaluator:

azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py

@@ -2,10 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException


 class _AsyncProtectedMaterialsEvaluator: