azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- azure/ai/evaluation/_azure/_clients.py +24 -8
- azure/ai/evaluation/_azure/_models.py +2 -2
- azure/ai/evaluation/_constants.py +18 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +69 -12
- azure/ai/evaluation/_evaluate/_utils.py +27 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -13
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/METADATA +71 -7
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/RECORD +31 -29
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_azure/_clients.py: pins `API_VERSION` to `2024-07-01-preview`, returns an Entra ID `TokenCredential` from `workspace_get_default_datastore` when storage-key access is disabled, requests expirable secrets, and folds the error details into the raised exception message.

@@ -17,7 +17,7 @@ from azure.ai.evaluation.simulator._model_tools._identity_manager import TokenSc
 from ._models import BlobStoreInfo, Workspace
 
 
-API_VERSION: Final[str] = "2024-
+API_VERSION: Final[str] = "2024-07-01-preview"
 QUERY_KEY_API_VERSION: Final[str] = "api-version"
 PATH_ML_WORKSPACES = ("providers", "Microsoft.MachineLearningServices", "workspaces")
 
@@ -69,7 +69,9 @@ class LiteMLClient:
         self._get_token_manager()
         return cast(TokenCredential, self._credential)
 
-    def workspace_get_default_datastore(
+    def workspace_get_default_datastore(
+        self, workspace_name: str, *, include_credentials: bool = False, **kwargs: Any
+    ) -> BlobStoreInfo:
         # 1. Get the default blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list?view=rest-azureml-2024-10-01
@@ -92,18 +94,29 @@ class LiteMLClient:
         account_name = props_json["accountName"]
         endpoint = props_json["endpoint"]
         container_name = props_json["containerName"]
+        credential_type = props_json.get("credentials", {}).get("credentialsType")
 
         # 2. Get the SAS token to use for accessing the blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list-secrets?view=rest-azureml-2024-10-01
-        blob_store_credential: Optional[Union[AzureSasCredential, str]]
-        if include_credentials:
+        blob_store_credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
+        if not include_credentials:
+            blob_store_credential = None
+        elif credential_type and credential_type.lower() == "none":
+            # If storage account key access is disabled, and only Microsoft Entra ID authentication is available,
+            # the credentialsType will be "None" and we should not attempt to get the secrets.
+            blob_store_credential = self.get_credential()
+        else:
             url = self._generate_path(
                 *PATH_ML_WORKSPACES, workspace_name, "datastores", "workspaceblobstore", "listSecrets"
             )
             secrets_response = self._http_client.request(
                 method="POST",
                 url=url,
+                json={
+                    "expirableSecret": True,
+                    "expireAfterHours": int(kwargs.get("key_expiration_hours", 1)),
+                },
                 params={
                     QUERY_KEY_API_VERSION: self._api_version,
                 },
@@ -114,10 +127,13 @@ class LiteMLClient:
             secrets_json = secrets_response.json()
            secrets_type = secrets_json["secretsType"].lower()
 
+            # As per this website, only SAS tokens, access tokens, or Entra IDs are valid for accessing blob data
+            # stores:
+            # https://learn.microsoft.com/rest/api/storageservices/authorize-requests-to-azure-storage.
            if secrets_type == "sas":
                blob_store_credential = AzureSasCredential(secrets_json["sasToken"])
            elif secrets_type == "accountkey":
-                # To support
+                # To support older versions of azure-storage-blob better, we return a string here instead of
                # an AzureNamedKeyCredential
                blob_store_credential = secrets_json["key"]
            else:
@@ -164,19 +180,19 @@ class LiteMLClient:
            # nothing to see here, move along
            return
 
-
+        message = f"The {description} request failed with HTTP {response.status_code}"
        try:
            error_json = response.json()["error"]
            additional_info = f"({error_json['code']}) {error_json['message']}"
+            message += f" - {additional_info}"
        except (JSONDecodeError, ValueError, KeyError):
            pass
 
        raise EvaluationException(
-            message=
+            message=message,
            target=ErrorTarget.EVALUATE,
            category=ErrorCategory.FAILED_EXECUTION,
            blame=ErrorBlame.SYSTEM_ERROR,
-            internal_message=additional_info,
        )
 
    def _generate_path(self, *paths: str) -> str:
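For context, a minimal sketch (not part of the diff) of how the widened credential union is consumed downstream: `BlobServiceClient` accepts an `AzureSasCredential`, an account-key string, or a `TokenCredential` directly, so the Entra ID path added above needs no special handling at the call site. The helper name and the explicit `https://` prefix are illustrative assumptions.

```python
from typing import Optional, Union

from azure.core.credentials import AzureSasCredential, TokenCredential
from azure.storage.blob import BlobServiceClient


def make_blob_client(
    account_name: str,
    endpoint: str,
    credential: Optional[Union[AzureSasCredential, TokenCredential, str]],
) -> BlobServiceClient:
    # Mirrors the account URL built in _eval_run.py; the scheme is added here
    # only for clarity.
    account_url = f"https://{account_name}.blob.{endpoint}"
    # BlobServiceClient dispatches on the credential type (SAS credential,
    # account-key string, or Entra ID token credential), so no branching is needed.
    return BlobServiceClient(account_url=account_url, credential=credential)
```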
azure/ai/evaluation/_azure/_models.py: widens `BlobStoreInfo.credential` to also accept a `TokenCredential`.

@@ -8,7 +8,7 @@
 
 from typing import Dict, List, NamedTuple, Optional, Union
 from msrest.serialization import Model
-from azure.core.credentials import AzureSasCredential
+from azure.core.credentials import AzureSasCredential, TokenCredential
 
 
 class BlobStoreInfo(NamedTuple):
@@ -16,7 +16,7 @@ class BlobStoreInfo(NamedTuple):
     account_name: str
     endpoint: str
     container_name: str
-    credential: Optional[Union[AzureSasCredential, str]]
+    credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
 
 
 class WorkspaceHubConfig(Model):
azure/ai/evaluation/_constants.py: adds the experimental `_AggregationType` enum.

@@ -1,7 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import enum
 from typing import Literal
+from azure.ai.evaluation._common._experimental import experimental
 
 
 class EvaluationMetrics:
@@ -57,6 +59,22 @@ class EvaluationRunProperties:
     EVALUATION_SDK = "_azureml.evaluation_sdk_name"
 
 
+@experimental
+class _AggregationType(enum.Enum):
+    """Defines how numeric evaluation results should be aggregated
+    to produce a single value. Used by individual evaluators to combine per-turn results for
+    a conversation-based input. In general, wherever this enum is used, it is also possible
+    to directly assign the underlying aggregation function for more complex use cases.
+    The 'custom' value is generally not an acceptable input, and should only be used as an output
+    to indicate that a custom aggregation function has been injected."""
+
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+    SUM = "sum"
+    CUSTOM = "custom"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
 
 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
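The enum itself carries no behavior; the mapping to actual functions lives in the new `_conversation_aggregators` helpers used by `EvaluatorBase` further below. As a rough illustration only (the real `GetAggregator` implementation is not shown in this diff), the mapping is expected to look roughly like this; every name here is a stand-in:

```python
import enum
import statistics
from typing import Callable, Dict, List


class AggregationType(enum.Enum):  # stand-in mirroring _AggregationType
    MEAN = "mean"
    MAX = "max"
    MIN = "min"
    SUM = "sum"
    CUSTOM = "custom"


# CUSTOM is intentionally absent: per the docstring above it only labels an
# injected callable and is not a valid input.
_AGGREGATORS: Dict[AggregationType, Callable[[List[float]], float]] = {
    AggregationType.MEAN: statistics.fmean,
    AggregationType.MAX: max,
    AggregationType.MIN: min,
    AggregationType.SUM: sum,
}


def get_aggregator(kind: AggregationType) -> Callable[[List[float]], float]:
    return _AGGREGATORS[kind]


assert get_aggregator(AggregationType.MAX)([0.2, 0.9, 0.5]) == 0.9
```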
azure/ai/evaluation/_evaluate/_batch_run/__init__.py: re-exports `ProxyRun`.

@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 from .target_run_context import TargetRunContext
+from .proxy_client import ProxyRun
 
-__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]
azure/ai/evaluation/_evaluate/_eval_run.py: passes the workspace name and `include_credentials=True` when resolving the default datastore.

@@ -421,7 +421,9 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             local_paths.append(local_file_path)
 
         # We will write the artifacts to the workspaceblobstore
-        datastore = self._management_client.workspace_get_default_datastore(
+        datastore = self._management_client.workspace_get_default_datastore(
+            self._workspace_name, include_credentials=True
+        )
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
 
         svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
azure/ai/evaluation/_evaluate/_evaluate.py: adds CSV support via `DataLoaderFactory`, a `fail_on_evaluator_errors` flag, disables the local promptflow trace destination, routes the target run through `ProxyClient`, and raises early when a target run produces no results.

@@ -12,6 +12,7 @@ import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 from promptflow.entities import Run
+from promptflow._sdk._configuration import Configuration
 
 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -20,17 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
+    DataLoaderFactory,
 )
 
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
@@ -429,10 +432,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         )
 
     try:
-
+        data_loader = DataLoaderFactory.get_loader(data)
+        initial_data_df = data_loader.load()
     except Exception as e:
         raise EvaluationException(
-            message=f"Unable to load data from '{data}'.
+            message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -444,7 +448,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-
+    batch_client: TClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -454,10 +458,10 @@ def _apply_target_to_data(
 
     :param target: The function to be applied to data.
     :type target: Callable
-    :param data: The path to input jsonl file.
+    :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param
-    :type
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -467,7 +471,7 @@ def _apply_target_to_data(
     """
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run:
+        run: ProxyRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
@@ -475,7 +479,18 @@ def _apply_target_to_data(
             name=_run_name,
         )
 
-    target_output: pd.DataFrame =
+    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+    run_summary = batch_client.get_run_summary(run)
+
+    if run_summary["completed_lines"] == 0:
+        msg = (f"Evaluation target failed to produce any results."
+               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -494,7 +509,7 @@ def _apply_target_to_data(
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)
 
-    return target_output, generated_columns, run
+    return target_output, generated_columns, run.run.result()
 
 
 def _process_column_mappings(
@@ -569,13 +584,14 @@ def evaluate(
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
        data will be run through target function and then results will be evaluated.
 
     :keyword data: Path to the data to be evaluated or passed to target if target is set.
-
+        JSONL and CSV files are supported.  `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
         and value as the evaluator function. Required.
@@ -594,6 +610,11 @@ def evaluate(
     :paramtype output_path: Optional[str]
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+        if ANY evaluator fails during their evaluation.
+        Defaults to false, which means that evaluations will continue regardless of failures.
+        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+    :paramtype fail_on_evaluator_errors: bool
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult
 
@@ -615,6 +636,7 @@ def evaluate(
             evaluator_config=evaluator_config,
             azure_ai_project=azure_ai_project,
             output_path=output_path,
+            fail_on_evaluator_errors=fail_on_evaluator_errors,
             **kwargs,
         )
     except Exception as e:
@@ -663,6 +685,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     print("\n====================================================\n")
 
 
+def _print_fail_flag_warning() -> None:
+    print(
+        "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+        + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+        + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+        + "without producing any outputs, since a single failure will cancel the entire run "
+        "when fail_on_evaluator_errors is enabled."
+    )
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -672,8 +704,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
+    if fail_on_evaluator_errors:
+        _print_fail_flag_warning()
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
 
     # Process evaluator config to replace ${target.} with ${data.}
@@ -690,6 +725,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
+    Configuration.get_instance().set_config("trace.destination", "none")
     pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None
 
@@ -702,7 +738,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
         )
 
         for evaluator_name, mapping in column_mapping.items():
@@ -773,6 +809,10 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluators_result_df = None
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
+        if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+            _print_summary(per_evaluator_results)
+            _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
         evaluator_result_df = evaluator_result["result"]
 
         # drop input columns
@@ -825,3 +865,20 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         _write_output(output_path, result)
 
     return result
+
+
+def _turn_error_logs_into_exception(log_path: str) -> None:
+    """Produce an EvaluationException using the contents of the inputted
+    file as the error message.
+
+    :param log_path: The path to the error log file.
+    :type log_path: str
+    """
+    with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+        error_message = file.read()
+    raise EvaluationException(
+        message=error_message,
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.FAILED_EXECUTION,
+        blame=ErrorBlame.UNKNOWN,
+    )
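Taken together, the user-visible changes in this file are CSV input support and the new `fail_on_evaluator_errors` flag. A usage sketch with the public `evaluate` API; the file name and its `response`/`ground_truth` columns are assumptions about the caller's dataset, not part of the diff:

```python
from azure.ai.evaluation import BleuScoreEvaluator, evaluate

result = evaluate(
    data="eval_data.csv",  # CSV is now accepted in addition to JSONL
    evaluators={"bleu": BleuScoreEvaluator()},
    # Cancel the whole run (raising an EvaluationException built from the
    # failing evaluator's error log) as soon as any evaluator line fails.
    fail_on_evaluator_errors=True,
)
print(result["metrics"])
```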
azure/ai/evaluation/_evaluate/_utils.py: adds the JSONL/CSV data-file loaders and `DataLoaderFactory`.

@@ -328,3 +328,30 @@ def set_event_loop_policy() -> None:
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+
+
+class JSONLDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_json(self.filename, lines=True)
+
+
+class CSVDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_csv(self.filename)
+
+
+class DataLoaderFactory:
+    @staticmethod
+    def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+        filename_str = str(filename).lower()
+        if filename_str.endswith(".csv"):
+            return CSVDataFileLoader(filename)
+
+        # fallback to JSONL to maintain backward compatibility
+        return JSONLDataFileLoader(filename)
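The factory is internal (`azure.ai.evaluation._evaluate._utils`), but its dispatch rule is worth noting: only a `.csv` suffix selects the CSV loader, and everything else falls back to JSONL for backward compatibility. The file names below are illustrative:

```python
from azure.ai.evaluation._evaluate._utils import DataLoaderFactory

csv_df = DataLoaderFactory.get_loader("eval_data.csv").load()      # pandas.read_csv
jsonl_df = DataLoaderFactory.get_loader("eval_data.jsonl").load()  # pandas.read_json(lines=True)
```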
azure/ai/evaluation/_evaluators/_bleu/_bleu.py: re-bases `BleuScoreEvaluator` on `EvaluatorBase`, replacing the private `_AsyncBleuScoreEvaluator` helper class.

@@ -1,30 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
-class _AsyncBleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        # NIST Smoothing
-        smoothing_function = SmoothingFunction().method4
-        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
-
-        return {
-            "bleu_score": score,
-        }
 
-
-class BleuScoreEvaluator:
+class BleuScoreEvaluator(EvaluatorBase):
     """
     Calculate the BLEU score for a given response and ground truth.
 
@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     def __init__(self):
-
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
 
-
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+        return {
+            "bleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str):
         """
         Evaluate the BLEU score between the response and the ground truth.
 
@@ -64,9 +73,21 @@ class BleuScoreEvaluator:
         :return: The BLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
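The refactor onto `EvaluatorBase` leaves the evaluator's public call signature unchanged; a single-turn call still returns a dict with a `bleu_score` key:

```python
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result["bleu_score"])  # a float in [0, 1]
```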
azure/ai/evaluation/_evaluators/_common/__init__.py: exports the new `MultiEvaluatorBase`.

@@ -5,9 +5,11 @@
 from ._base_eval import EvaluatorBase
 from ._base_prompty_eval import PromptyEvaluatorBase
 from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+from ._base_multi_eval import MultiEvaluatorBase
 
 __all__ = [
     "EvaluatorBase",
     "PromptyEvaluatorBase",
     "RaiServiceEvaluatorBase",
+    "MultiEvaluatorBase",
 ]
azure/ai/evaluation/_evaluators/_common/_base_eval.py: adds ground-truth plumbing for conversation inputs and configurable per-turn aggregation (`conversation_aggregation_type` / `conversation_aggregator_override`, plus experimental setter and getter helpers).

@@ -4,15 +4,18 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import _AggregationType
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -25,6 +28,7 @@ class DerivedEvalInput(TypedDict, total=False):
     query: Dict[str, Any]
     response: Dict[str, Any]
     context: str
+    ground_truth: str
 
 
 AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
@@ -69,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
     """
 
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -80,11 +91,17 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         *,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
 
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
@@ -158,6 +175,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         include_context = "context" in self._singleton_inputs
         include_query = "query" in self._singleton_inputs
         include_response = "response" in self._singleton_inputs
+        include_ground_truth = "ground_truth" in self._singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -198,6 +216,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                     eval_input["response"] = response.get("content", "")
                 if include_context:
                     eval_input["context"] = str(context)
+                if include_ground_truth:
+                    eval_input["ground_truth"] = response.get("ground_truth", "")
                 eval_inputs.append(eval_input)
             return eval_inputs
 
@@ -355,7 +375,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] =
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
@@ -383,10 +403,51 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~``
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
+        evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> _AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is inputted into an evaluator that evaluates each turn individually). The individual inputs
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation._AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+
 
 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
@@ -402,7 +463,9 @@ class AsyncEvaluatorBase:
     # are just not passed into this function instead of ending up in kwargs.
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
    # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-    async def __call__(
+    async def __call__(
+        self, *, query=None, response=None, context=None, conversation=None, ground_truth=None, **kwargs
+    ):
        if conversation is not None:
            kwargs["conversation"] = conversation
        if query is not None:
@@ -411,4 +474,6 @@ class AsyncEvaluatorBase:
            kwargs["response"] = response
        if context is not None:
            kwargs["context"] = context
+        if ground_truth is not None:
+            kwargs["ground_truth"] = ground_truth
        return await self._real_call(**kwargs)