azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +1 -15
- azure/ai/evaluation/_azure/_clients.py +24 -8
- azure/ai/evaluation/_azure/_models.py +2 -2
- azure/ai/evaluation/_common/utils.py +8 -8
- azure/ai/evaluation/_constants.py +21 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +74 -14
- azure/ai/evaluation/_evaluate/_utils.py +27 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +640 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +10 -3
- azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
- azure/ai/evaluation/simulator/_simulator.py +21 -13
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/METADATA +77 -7
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/RECORD +40 -44
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
```diff
@@ -12,14 +12,6 @@ from ._evaluators._content_safety import (
     SexualEvaluator,
     ViolenceEvaluator,
 )
-from ._evaluators._multimodal._content_safety_multimodal import (
-    ContentSafetyMultimodalEvaluator,
-    HateUnfairnessMultimodalEvaluator,
-    SelfHarmMultimodalEvaluator,
-    SexualMultimodalEvaluator,
-    ViolenceMultimodalEvaluator,
-)
-from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
@@ -72,11 +64,5 @@ __all__ = [
     "EvaluatorConfig",
     "Conversation",
     "Message",
-    "EvaluationResult"
-    "ContentSafetyMultimodalEvaluator",
-    "HateUnfairnessMultimodalEvaluator",
-    "SelfHarmMultimodalEvaluator",
-    "SexualMultimodalEvaluator",
-    "ViolenceMultimodalEvaluator",
-    "ProtectedMaterialMultimodalEvaluator",
+    "EvaluationResult"
 ]
```
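The multimodal evaluators are dropped from the public namespace in 1.3.0 (their modules are removed entirely, per the file list above). A minimal sketch of what this means for imports; the evaluator names kept here are taken from the surrounding diff context, the rest is illustrative:

```python
# Still importable from azure-ai-evaluation 1.3.0 (names shown in the diff context above).
from azure.ai.evaluation import F1ScoreEvaluator, FluencyEvaluator, GleuScoreEvaluator

# No longer importable: the multimodal evaluators were removed in 1.3.0.
try:
    from azure.ai.evaluation import ContentSafetyMultimodalEvaluator
except ImportError:
    print("Multimodal evaluators are not available in azure-ai-evaluation 1.3.0")
```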
azure/ai/evaluation/_azure/_clients.py
CHANGED
```diff
@@ -17,7 +17,7 @@ from azure.ai.evaluation.simulator._model_tools._identity_manager import TokenSc
 from ._models import BlobStoreInfo, Workspace


-API_VERSION: Final[str] = "2024-
+API_VERSION: Final[str] = "2024-07-01-preview"
 QUERY_KEY_API_VERSION: Final[str] = "api-version"
 PATH_ML_WORKSPACES = ("providers", "Microsoft.MachineLearningServices", "workspaces")
@@ -69,7 +69,9 @@ class LiteMLClient:
         self._get_token_manager()
         return cast(TokenCredential, self._credential)

-    def workspace_get_default_datastore(
+    def workspace_get_default_datastore(
+        self, workspace_name: str, *, include_credentials: bool = False, **kwargs: Any
+    ) -> BlobStoreInfo:
         # 1. Get the default blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list?view=rest-azureml-2024-10-01
@@ -92,18 +94,29 @@ class LiteMLClient:
         account_name = props_json["accountName"]
         endpoint = props_json["endpoint"]
         container_name = props_json["containerName"]
+        credential_type = props_json.get("credentials", {}).get("credentialsType")

         # 2. Get the SAS token to use for accessing the blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list-secrets?view=rest-azureml-2024-10-01
-        blob_store_credential: Optional[Union[AzureSasCredential, str]]
-        if include_credentials:
+        blob_store_credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
+        if not include_credentials:
+            blob_store_credential = None
+        elif credential_type and credential_type.lower() == "none":
+            # If storage account key access is disabled, and only Microsoft Entra ID authentication is available,
+            # the credentialsType will be "None" and we should not attempt to get the secrets.
+            blob_store_credential = self.get_credential()
+        else:
             url = self._generate_path(
                 *PATH_ML_WORKSPACES, workspace_name, "datastores", "workspaceblobstore", "listSecrets"
             )
             secrets_response = self._http_client.request(
                 method="POST",
                 url=url,
+                json={
+                    "expirableSecret": True,
+                    "expireAfterHours": int(kwargs.get("key_expiration_hours", 1)),
+                },
                 params={
                     QUERY_KEY_API_VERSION: self._api_version,
                 },
@@ -114,10 +127,13 @@ class LiteMLClient:
             secrets_json = secrets_response.json()
             secrets_type = secrets_json["secretsType"].lower()

+            # As per this website, only SAS tokens, access tokens, or Entra IDs are valid for accessing blob data
+            # stores:
+            # https://learn.microsoft.com/rest/api/storageservices/authorize-requests-to-azure-storage.
             if secrets_type == "sas":
                 blob_store_credential = AzureSasCredential(secrets_json["sasToken"])
             elif secrets_type == "accountkey":
-                # To support
+                # To support older versions of azure-storage-blob better, we return a string here instead of
                 # an AzureNamedKeyCredential
                 blob_store_credential = secrets_json["key"]
             else:
@@ -164,19 +180,19 @@ class LiteMLClient:
             # nothing to see here, move along
             return

-
+        message = f"The {description} request failed with HTTP {response.status_code}"
         try:
             error_json = response.json()["error"]
             additional_info = f"({error_json['code']}) {error_json['message']}"
+            message += f" - {additional_info}"
         except (JSONDecodeError, ValueError, KeyError):
             pass

         raise EvaluationException(
-            message=
+            message=message,
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.SYSTEM_ERROR,
-            internal_message=additional_info,
         )

     def _generate_path(self, *paths: str) -> str:
```
azure/ai/evaluation/_azure/_models.py
CHANGED
```diff
@@ -8,7 +8,7 @@

 from typing import Dict, List, NamedTuple, Optional, Union
 from msrest.serialization import Model
-from azure.core.credentials import AzureSasCredential
+from azure.core.credentials import AzureSasCredential, TokenCredential


 class BlobStoreInfo(NamedTuple):
@@ -16,7 +16,7 @@ class BlobStoreInfo(NamedTuple):
     account_name: str
     endpoint: str
     container_name: str
-    credential: Optional[Union[AzureSasCredential, str]]
+    credential: Optional[Union[AzureSasCredential, TokenCredential, str]]


 class WorkspaceHubConfig(Model):
```
azure/ai/evaluation/_common/utils.py
CHANGED
```diff
@@ -366,7 +366,7 @@ def validate_conversation(conversation):
     if not isinstance(messages, list):
         raise_exception(
             "'messages' parameter must be a JSON-compatible list of chat messages",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     expected_roles = {"user", "assistant", "system"}
     image_found = False
@@ -393,7 +393,7 @@ def validate_conversation(conversation):
         ):
             raise_exception(
                 f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
-                ErrorTarget.
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if isinstance(message, AssistantMessage):
             assistant_message_count += 1
@@ -407,7 +407,7 @@ def validate_conversation(conversation):
         if message.get("role") not in expected_roles:
             raise_exception(
                 f"Invalid role provided: {message.get('role')}. Message number: {num}",
-                ErrorTarget.
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if message.get("role") == "assistant":
             assistant_message_count += 1
@@ -417,7 +417,7 @@ def validate_conversation(conversation):
         if not isinstance(content, (str, list)):
             raise_exception(
                 f"Content in each turn must be a string or array. Message number: {num}",
-                ErrorTarget.
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if isinstance(content, list):
             if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
@@ -425,21 +425,21 @@ def validate_conversation(conversation):
     if not image_found:
         raise_exception(
             "Message needs to have multi-modal input like images.",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if assistant_message_count == 0:
         raise_exception(
             "Assistant role required in one of the messages.",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if user_message_count == 0:
         raise_exception(
             "User role required in one of the messages.",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if assistant_message_count > 1:
         raise_exception(
             "Evaluators for multimodal conversations only support single turn. "
             "User and assistant role expected as the only role in each message.",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
```
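For reference, a minimal sketch of a conversation payload that satisfies the checks enforced by `validate_conversation` above (one user message with an `image_url` item, exactly one assistant message); the URL and text are placeholders:

```python
# Sketch: a single-turn multimodal conversation shaped to pass validate_conversation.
conversation = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
            ],
        },
        {
            "role": "assistant",
            "content": "The image shows a red bicycle leaning against a wall.",
        },
    ]
}
```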
azure/ai/evaluation/_constants.py
CHANGED
```diff
@@ -1,7 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import enum
 from typing import Literal
+from azure.ai.evaluation._common._experimental import experimental


 class EvaluationMetrics:
@@ -20,6 +22,9 @@ class EvaluationMetrics:
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
+    ARTWORK = "artwork"
+    FICTIONAL_CHARACTERS = "fictional_characters"
+    LOGOS_AND_BRANDS = "logos_and_brands"
     XPIA = "xpia"


@@ -57,6 +62,22 @@ class EvaluationRunProperties:
     EVALUATION_SDK = "_azureml.evaluation_sdk_name"


+@experimental
+class _AggregationType(enum.Enum):
+    """Defines how numeric evaluation results should be aggregated
+    to produce a single value. Used by individual evaluators to combine per-turn results for
+    a conversation-based input. In general, wherever this enum is used, it is also possible
+    to directly assign the underlying aggregation function for more complex use cases.
+    The 'custom' value is generally not an acceptable input, and should only be used as an output
+    to indicate that a custom aggregation function has been injected."""
+
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+    SUM = "sum"
+    CUSTOM = "custom"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
```
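The docstring above notes that a raw aggregation function can be supplied wherever the enum is accepted. The snippet below is an illustrative sketch of resolving a non-CUSTOM enum member to a plain callable; the `resolve`/`aggregate` helpers are hypothetical and not part of the SDK:

```python
import statistics
from typing import Callable, Dict, List

from azure.ai.evaluation._constants import _AggregationType

# Hypothetical mapping from enum members to callables that collapse per-turn scores.
_AGGREGATORS: Dict[_AggregationType, Callable[[List[float]], float]] = {
    _AggregationType.MEAN: statistics.mean,
    _AggregationType.MAX: max,
    _AggregationType.MIN: min,
    _AggregationType.SUM: sum,
}

def aggregate(scores: List[float], kind: _AggregationType) -> float:
    return _AGGREGATORS[kind](scores)

print(aggregate([3.0, 4.0, 5.0], _AggregationType.MEAN))  # 4.0
```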
azure/ai/evaluation/_evaluate/_batch_run/__init__.py
CHANGED
```diff
@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 from .target_run_context import TargetRunContext
+from .proxy_client import ProxyRun

-__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]
```
azure/ai/evaluation/_evaluate/_eval_run.py
CHANGED
```diff
@@ -421,7 +421,9 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             local_paths.append(local_file_path)

         # We will write the artifacts to the workspaceblobstore
-        datastore = self._management_client.workspace_get_default_datastore(
+        datastore = self._management_client.workspace_get_default_datastore(
+            self._workspace_name, include_credentials=True
+        )
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"

         svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
```
azure/ai/evaluation/_evaluate/_evaluate.py
CHANGED
```diff
@@ -12,6 +12,7 @@ import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 from promptflow.entities import Run
+from promptflow._sdk._configuration import Configuration

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -20,17 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
+    DataLoaderFactory,
 )

 TClient = TypeVar("TClient", ProxyClient, CodeClient)
@@ -144,6 +147,9 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     """
     handled_metrics = [
         EvaluationMetrics.PROTECTED_MATERIAL,
+        EvaluationMetrics.FICTIONAL_CHARACTERS,
+        EvaluationMetrics.ARTWORK,
+        EvaluationMetrics.LOGOS_AND_BRANDS,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
     ]
@@ -429,10 +435,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         )

     try:
-
+        data_loader = DataLoaderFactory.get_loader(data)
+        initial_data_df = data_loader.load()
     except Exception as e:
         raise EvaluationException(
-            message=f"Unable to load data from '{data}'.
+            message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -444,7 +451,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-
+    batch_client: TClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -454,10 +461,10 @@ def _apply_target_to_data(

     :param target: The function to be applied to data.
     :type target: Callable
-    :param data: The path to input jsonl file.
+    :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param
-    :type
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -467,7 +474,7 @@ def _apply_target_to_data(
     """
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run:
+        run: ProxyRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
@@ -475,7 +482,18 @@ def _apply_target_to_data(
             name=_run_name,
         )

-    target_output: pd.DataFrame =
+    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+    run_summary = batch_client.get_run_summary(run)
+
+    if run_summary["completed_lines"] == 0:
+        msg = (f"Evaluation target failed to produce any results."
+               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -494,7 +512,7 @@ def _apply_target_to_data(
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)

-    return target_output, generated_columns, run
+    return target_output, generated_columns, run.run.result()


 def _process_column_mappings(
@@ -510,7 +528,7 @@ def _process_column_mappings(

     processed_config: Dict[str, Dict[str, str]] = {}

-
+    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")

     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
@@ -519,7 +537,7 @@ def _process_column_mappings(

             for map_to_key, map_value in mapping_config.items():
                 # Check if there's any unexpected reference other than ${target.} or ${data.}
-                if
+                if not expected_references.search(map_value):
                     msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                     raise EvaluationException(
                         message=msg,
@@ -569,13 +587,14 @@ def evaluate(
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

     :keyword data: Path to the data to be evaluated or passed to target if target is set.
-
+        JSONL and CSV files are supported.  `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
         and value as the evaluator function. Required.
@@ -594,6 +613,11 @@ def evaluate(
     :paramtype output_path: Optional[str]
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+        if ANY evaluator fails during their evaluation.
+        Defaults to false, which means that evaluations will continue regardless of failures.
+        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+    :paramtype fail_on_evaluator_errors: bool
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult

```
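Taken together, these changes mean `evaluate` now accepts CSV input and can abort early when an evaluator fails. A minimal usage sketch; the file names, evaluator choices, and the printed key are placeholders rather than values taken from this diff:

```python
from azure.ai.evaluation import BleuScoreEvaluator, F1ScoreEvaluator, evaluate

# Hypothetical input: a CSV file with "response" and "ground_truth" columns.
result = evaluate(
    data="qa_results.csv",            # CSV is accepted in 1.3.0 in addition to JSONL
    evaluators={
        "bleu": BleuScoreEvaluator(),
        "f1": F1ScoreEvaluator(),
    },
    fail_on_evaluator_errors=True,    # new flag: raise instead of continuing on evaluator errors
    output_path="./evaluation_results.json",
)
print(result["metrics"])
```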
```diff
@@ -615,6 +639,7 @@
             evaluator_config=evaluator_config,
             azure_ai_project=azure_ai_project,
             output_path=output_path,
+            fail_on_evaluator_errors=fail_on_evaluator_errors,
             **kwargs,
         )
     except Exception as e:
@@ -663,6 +688,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     print("\n====================================================\n")


+def _print_fail_flag_warning() -> None:
+    print(
+        "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+        + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+        + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+        + "without producing any outputs, since a single failure will cancel the entire run "
+        "when fail_on_evaluator_errors is enabled."
+    )
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -672,8 +707,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
+    if fail_on_evaluator_errors:
+        _print_fail_flag_warning()
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -690,6 +728,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)

+    Configuration.get_instance().set_config("trace.destination", "none")
     pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None

@@ -702,7 +741,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
         )

     for evaluator_name, mapping in column_mapping.items():
@@ -773,6 +812,10 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluators_result_df = None
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
+        if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+            _print_summary(per_evaluator_results)
+            _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
         evaluator_result_df = evaluator_result["result"]

         # drop input columns
@@ -825,3 +868,20 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         _write_output(output_path, result)

     return result
+
+
+def _turn_error_logs_into_exception(log_path: str) -> None:
+    """Produce an EvaluationException using the contents of the inputted
+    file as the error message.
+
+    :param log_path: The path to the error log file.
+    :type log_path: str
+    """
+    with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+        error_message = file.read()
+    raise EvaluationException(
+        message=error_message,
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.FAILED_EXECUTION,
+        blame=ErrorBlame.UNKNOWN,
+    )
```
azure/ai/evaluation/_evaluate/_utils.py
CHANGED
```diff
@@ -328,3 +328,30 @@ def set_event_loop_policy() -> None:
     # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
     # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+
+
+class JSONLDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_json(self.filename, lines=True)
+
+
+class CSVDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_csv(self.filename)
+
+
+class DataLoaderFactory:
+    @staticmethod
+    def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+        filename_str = str(filename).lower()
+        if filename_str.endswith(".csv"):
+            return CSVDataFileLoader(filename)
+
+        # fallback to JSONL to maintain backward compatibility
+        return JSONLDataFileLoader(filename)
```
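The loaders above are thin wrappers over pandas, selected by file extension with a JSONL fallback. A short sketch of how this internal helper is exercised; the file name is a placeholder and the module path is private API:

```python
from azure.ai.evaluation._evaluate._utils import DataLoaderFactory

loader = DataLoaderFactory.get_loader("eval_data.csv")  # -> CSVDataFileLoader for .csv, JSONL otherwise
df = loader.load()                                      # pandas DataFrame via pd.read_csv / pd.read_json
print(df.columns.tolist())
```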
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
CHANGED
```diff
@@ -1,30 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize

+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-class _AsyncBleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        # NIST Smoothing
-        smoothing_function = SmoothingFunction().method4
-        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
-
-        return {
-            "bleu_score": score,
-        }

-
-class BleuScoreEvaluator:
+class BleuScoreEvaluator(EvaluatorBase):
     """
     Calculate the BLEU score for a given response and ground truth.

@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self):
-
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)

-
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+        return {
+            "bleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str):
         """
         Evaluate the BLEU score between the response and the ground truth.

@@ -64,9 +73,21 @@ class BleuScoreEvaluator:
         :return: The BLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )

-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
```
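The refactor moves the scoring logic from the old `_AsyncBleuScoreEvaluator` into `_do_eval` on the `EvaluatorBase` subclass, while the public keyword-argument call shape stays the same. A short usage sketch with placeholder strings:

```python
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result["bleu_score"])  # a float; the exact value depends on NLTK's method4 smoothing
```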
azure/ai/evaluation/_evaluators/_common/__init__.py
CHANGED
```diff
@@ -5,9 +5,11 @@ from ._base_eval import EvaluatorBase
 from ._base_eval import EvaluatorBase
 from ._base_prompty_eval import PromptyEvaluatorBase
 from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+from ._base_multi_eval import MultiEvaluatorBase

 __all__ = [
     "EvaluatorBase",
     "PromptyEvaluatorBase",
     "RaiServiceEvaluatorBase",
+    "MultiEvaluatorBase",
 ]
```