azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/rai_service.py +30 -21
- azure/ai/evaluation/_constants.py +19 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +16 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +76 -44
- azure/ai/evaluation/_evaluate/_utils.py +93 -34
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +140 -5
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -1
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +40 -2
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +6 -43
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +42 -82
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +35 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +40 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +24 -13
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/METADATA +84 -15
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/RECORD +47 -41
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import enum
 from typing import Literal
+from azure.ai.evaluation._common._experimental import experimental
 
 
 class EvaluationMetrics:
@@ -54,6 +56,23 @@ class EvaluationRunProperties:
 
     RUN_TYPE = "runType"
     EVALUATION_RUN = "_azureml.evaluation_run"
+    EVALUATION_SDK = "_azureml.evaluation_sdk_name"
+
+
+@experimental
+class _AggregationType(enum.Enum):
+    """Defines how numeric evaluation results should be aggregated
+    to produce a single value. Used by individual evaluators to combine per-turn results for
+    a conversation-based input. In general, wherever this enum is used, it is also possible
+    to directly assign the underlying aggregation function for more complex use cases.
+    The 'custom' value is generally not an acceptable input, and should only be used as an output
+    to indicate that a custom aggregation function has been injected."""
+
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+    SUM = "sum"
+    CUSTOM = "custom"
 
 
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
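The new _AggregationType enum above only names the reduction. As a rough illustration (an assumption for demonstration, not the evaluators' internal code), the values map naturally onto Python's builtin aggregators when collapsing per-turn scores into one conversation-level score:

# Illustrative sketch only -- the mapping below is assumed for demonstration.
from statistics import mean

per_turn_scores = [3, 5, 4]  # e.g. one numeric result per conversation turn

aggregators = {
    "mean": mean,
    "max": max,
    "min": min,
    "sum": sum,
}

conversation_score = aggregators["mean"](per_turn_scores)  # -> 4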
@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 from .target_run_context import TargetRunContext
+from .proxy_client import ProxyRun
 
-__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]
@@ -16,7 +16,7 @@ class TargetRunContext:
     :type upload_snapshot: bool
     """
 
-    def __init__(self, upload_snapshot: bool) -> None:
+    def __init__(self, upload_snapshot: bool = False) -> None:
         self._upload_snapshot = upload_snapshot
         self._original_cwd = os.getcwd()
 
@@ -22,29 +22,12 @@ from azure.ai.evaluation._version import VERSION
 from azure.core.pipeline.policies import RetryPolicy
 from azure.core.rest import HttpResponse
 from azure.core.exceptions import HttpResponseError
+from azure.storage.blob import BlobServiceClient
+from azure.ai.evaluation._azure._clients import LiteMLClient
 
 LOGGER = logging.getLogger(__name__)
 
 
-# Handle optional import. The azure libraries are only present if
-# promptflow-azure is installed.
-try:
-    from azure.ai.ml import MLClient
-    from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
-    from azure.ai.ml.entities._datastore.datastore import Datastore
-    from azure.storage.blob import BlobServiceClient
-except (ModuleNotFoundError, ImportError):
-    raise EvaluationException(  # pylint: disable=raise-missing-from
-        message=(
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        ),
-        target=ErrorTarget.EVALUATE,
-        category=ErrorCategory.MISSING_PACKAGE,
-        blame=ErrorBlame.USER_ERROR,
-    )
-
-
 @dataclasses.dataclass
 class RunInfo:
     """
@@ -93,15 +76,16 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
     :type group_name: str
     :param workspace_name: The name of workspace/project used to track run.
     :type workspace_name: str
-    :param
-    :type
+    :param management_client: The trace destination string to parse the AI ML workspace blob store from.
+    :type management_client:
+        ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
     :param promptflow_run: The promptflow run used by the
+    :type promptflow_run: Optional[promptflow._sdk.entities.Run]
     """
 
     _MAX_RETRIES = 5
     _BACKOFF_FACTOR = 2
     _TIMEOUT = 5
-    _SCOPE = "https://management.azure.com/.default"
 
     EVALUATION_ARTIFACT = "instance_results.jsonl"
 
@@ -112,14 +96,14 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         subscription_id: str,
         group_name: str,
         workspace_name: str,
-
+        management_client: LiteMLClient,
         promptflow_run: Optional[Run] = None,
     ) -> None:
         self._tracking_uri: str = tracking_uri
         self._subscription_id: str = subscription_id
         self._resource_group_name: str = group_name
         self._workspace_name: str = workspace_name
-        self.
+        self._management_client: LiteMLClient = management_client
         self._is_promptflow_run: bool = promptflow_run is not None
         self._run_name = run_name
         self._promptflow_run = promptflow_run
@@ -184,7 +168,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         if self._promptflow_run is not None:
             self._info = RunInfo(
                 self._promptflow_run.name,
-                self._promptflow_run._experiment_name,  # pylint: disable=protected-access
+                self._promptflow_run._experiment_name or "",  # pylint: disable=protected-access
                 self._promptflow_run.name,
             )
         else:
@@ -310,12 +294,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         """
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"
 
-    def _get_token(self):
-
-        # is an optional dependency.
-        from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module
-
-        return ArmTokenCache().get_token(self._ml_client._credential)  # pylint: disable=protected-access
+    def _get_token(self) -> str:
+        return self._management_client.get_token()
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -441,9 +421,12 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             local_paths.append(local_file_path)
 
         # We will write the artifacts to the workspaceblobstore
-        datastore = self.
+        datastore = self._management_client.workspace_get_default_datastore(
+            self._workspace_name, include_credentials=True
+        )
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
-
+
+        svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
         try:
             for local, remote in zip(local_paths, remote_paths["paths"]):
                 blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
@@ -515,16 +498,6 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)
 
-    def _get_datastore_credential(self, datastore: "Datastore"):
-        # Reference the logic in azure.ai.ml._artifact._artifact_utilities
-        # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
-        credential = datastore.credentials
-        if isinstance(credential, AccountKeyConfiguration):
-            return credential.account_key
-        if hasattr(credential, "sas_token"):
-            return credential.sas_token
-        return self._ml_client.datastores._credential  # pylint: disable=protected-access
-
     def log_metric(self, key: str, value: float) -> None:
         """
         Log the metric to azure similar to how it is done by mlflow.
@@ -10,9 +10,9 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, T
 
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
+from promptflow._sdk._configuration import Configuration
 
 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -21,18 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
-
+    DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
+    DataLoaderFactory,
 )
 
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
@@ -431,10 +432,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     )
 
     try:
-
+        data_loader = DataLoaderFactory.get_loader(data)
+        initial_data_df = data_loader.load()
     except Exception as e:
         raise EvaluationException(
-            message=f"Unable to load data from '{data}'.
+            message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -446,7 +448,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-
+    batch_client: TClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -456,10 +458,10 @@ def _apply_target_to_data(
 
     :param target: The function to be applied to data.
     :type target: Callable
-    :param data: The path to input jsonl file.
+    :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param
-    :type
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -468,35 +470,27 @@ def _apply_target_to_data(
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
     _run_name = kwargs.get("_run_name")
-
-
-
-
-
-
-
-
-            properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-            stream=True,
-            name=_run_name,
-        )
-    except (UserAuthenticationError, UploadInternalError) as ex:
-        if "Failed to upload run" in ex.message:
-            msg = (
-                "Failed to upload the target run to the cloud. "
-                "This may be caused by insufficient permission to access storage or other errors."
-            )
-            raise EvaluationException(
-                message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                blame=ErrorBlame.USER_ERROR,
-                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-            ) from ex
+    with TargetRunContext():
+        run: ProxyRun = batch_client.run(
+            flow=target,
+            display_name=evaluation_name,
+            data=data,
+            stream=True,
+            name=_run_name,
+        )
 
-
+    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+    run_summary = batch_client.get_run_summary(run)
 
-
+    if run_summary["completed_lines"] == 0:
+        msg = (f"Evaluation target failed to produce any results."
+               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -515,7 +509,7 @@ def _apply_target_to_data(
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)
 
-    return target_output, generated_columns, run
+    return target_output, generated_columns, run.run.result()
 
 
 def _process_column_mappings(
@@ -590,13 +584,14 @@ def evaluate(
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.
 
     :keyword data: Path to the data to be evaluated or passed to target if target is set.
-
+        JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
         and value as the evaluator function. Required.
@@ -615,6 +610,11 @@ def evaluate(
     :paramtype output_path: Optional[str]
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+        if ANY evaluator fails during their evaluation.
+        Defaults to false, which means that evaluations will continue regardless of failures.
+        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+    :paramtype fail_on_evaluator_errors: bool
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult
 
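A minimal usage sketch of the new flag. The data file name, the choice of evaluator, and the required columns are placeholders for illustration, not values taken from this diff:

# Hypothetical example: with fail_on_evaluator_errors=True, any failed
# evaluator line raises EvaluationException instead of silently leaving
# gaps in the metrics.
from azure.ai.evaluation import evaluate, F1ScoreEvaluator

result = evaluate(
    data="qa_pairs.jsonl",                  # JSONL and CSV are both accepted as of 1.2.0
    evaluators={"f1": F1ScoreEvaluator()},
    fail_on_evaluator_errors=True,
)
print(result["metrics"])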
@@ -636,6 +636,7 @@ def evaluate(
             evaluator_config=evaluator_config,
             azure_ai_project=azure_ai_project,
             output_path=output_path,
+            fail_on_evaluator_errors=fail_on_evaluator_errors,
             **kwargs,
         )
     except Exception as e:
@@ -684,6 +685,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     print("\n====================================================\n")
 
 
+def _print_fail_flag_warning() -> None:
+    print(
+        "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+        + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+        + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+        + "without producing any outputs, since a single failure will cancel the entire run "
+        "when fail_on_evaluator_errors is enabled."
+    )
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -693,8 +704,11 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
+    if fail_on_evaluator_errors:
+        _print_fail_flag_warning()
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
 
     # Process evaluator config to replace ${target.} with ${data.}
@@ -711,6 +725,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
+    Configuration.get_instance().set_config("trace.destination", "none")
     pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None
 
@@ -723,7 +738,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
         )
 
     for evaluator_name, mapping in column_mapping.items():
@@ -794,6 +809,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     evaluators_result_df = None
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
+        if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+            _print_summary(per_evaluator_results)
+            _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
         evaluator_result_df = evaluator_result["result"]
 
         # drop input columns
@@ -834,11 +853,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     studio_url = None
     if trace_destination:
         studio_url = _log_metrics_and_instance_results(
-            metrics,
-            result_df,
-            trace_destination,
-            target_run,
-            evaluation_name,
+            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
         )
 
     result_df_dict = result_df.to_dict("records")
@@ -850,3 +865,20 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     _write_output(output_path, result)
 
     return result
+
+
+def _turn_error_logs_into_exception(log_path: str) -> None:
+    """Produce an EvaluationException using the contents of the inputted
+    file as the error message.
+
+    :param log_path: The path to the error log file.
+    :type log_path: str
+    """
+    with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+        error_message = file.read()
+    raise EvaluationException(
+        message=error_message,
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.FAILED_EXECUTION,
+        blame=ErrorBlame.UNKNOWN,
+    )
@@ -7,12 +7,11 @@ import os
 import re
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional,
+from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64
 
 import pandas as pd
-from promptflow.client import PFClient
 from promptflow.entities import Run
 
 from azure.ai.evaluation._constants import (
@@ -23,6 +22,8 @@ from azure.ai.evaluation._constants import (
 )
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._azure._clients import LiteMLClient
 
 LOGGER = logging.getLogger(__name__)
 
@@ -45,6 +46,8 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
+    from promptflow._cli._utils import get_workspace_triad_from_local
+
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -58,10 +61,20 @@ def extract_workspace_triad_from_trace_provider( # pylint: disable=name-too-lon
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.UNKNOWN,
         )
+
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-
+
+    # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+    # for backwards compatibility with what the original code that depended on promptflow-azure did
+    if not (subscription_id and resource_group_name and workspace_name):
+        local = get_workspace_triad_from_local()
+        subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+        resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+        workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+    return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")
 
 
 def load_jsonl(path):
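For orientation, the workspace triad is parsed out of a trace-destination URI of roughly the following shape. The URI below is a made-up placeholder; the authoritative pattern is the AZURE_WORKSPACE_REGEX_FORMAT constant used above.

# Hypothetical trace destination -- subscription, resource group, and workspace
# names are placeholders for illustration only.
trace_destination = (
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000"
    "/resourceGroups/my-resource-group"
    "/providers/Microsoft.MachineLearningServices"
    "/workspaces/my-project"
)
# extract_workspace_triad_from_trace_provider(trace_destination) would yield an
# AzureMLWorkspace with those three values; with this change, any empty part now
# falls back to the local promptflow configuration or the AZUREML_ARM_* variables.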
@@ -69,19 +82,6 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
-    from promptflow.azure._cli._utils import _get_azure_pf_client
-
-    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
-    azure_pf_client = _get_azure_pf_client(
-        subscription_id=ws_triad.subscription_id,
-        resource_group=ws_triad.resource_group_name,
-        workspace_name=ws_triad.workspace_name,
-    )
-
-    return azure_pf_client, ws_triad
-
-
 def _store_multimodal_content(messages, tmpdir: str):
     # verify if images folder exists
     images_folder_path = os.path.join(tmpdir, "images")
@@ -91,23 +91,40 @@ def _store_multimodal_content(messages, tmpdir: str):
     for message in messages:
         if isinstance(message.get("content", []), list):
             for content in message.get("content", []):
-
-
-
-
-
+                process_message_content(content, images_folder_path)
+
+
+def process_message_content(content, images_folder_path):
+    if content.get("type", "") == "image_url":
+        image_url = content.get("image_url")
+
+        if not image_url or "url" not in image_url:
+            return None
+
+        url = image_url["url"]
+        if not url.startswith("data:image/"):
+            return None
+
+        match = re.search("data:image/([^;]+);", url)
+        if not match:
+            return None
 
-
-
-
+        ext = match.group(1)
+        # Extract the base64 string
+        base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")
 
-
-
+        # Generate a unique filename
+        image_file_name = f"{str(uuid.uuid4())}.{ext}"
+        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
 
-
-
-
-
+        # Decode the base64 string to binary image data
+        image_data_binary = base64.b64decode(base64image)
+
+        # Write the binary image data to the file
+        image_file_path = os.path.join(images_folder_path, image_file_name)
+        with open(image_file_path, "wb") as f:
+            f.write(image_data_binary)
+        return None
 
 
 def _log_metrics_and_instance_results(
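A sketch of the kind of message the new process_message_content helper handles. The payload below is a placeholder: the helper only decodes and writes the base64 bytes, it does not validate image contents.

import base64

fake_png_b64 = base64.b64encode(b"not-a-real-png").decode()  # placeholder payload

message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe the image."},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{fake_png_b64}"}},
    ],
}
# After process_message_content(message["content"][1], images_folder_path) runs,
# the "url" value is rewritten to "images/<uuid>.png" and the decoded bytes are
# written to <images_folder_path>/<uuid>.png.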
@@ -116,6 +133,7 @@ def _log_metrics_and_instance_results(
     trace_destination: Optional[str],
     run: Run,
     evaluation_name: Optional[str],
+    **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
 
@@ -123,19 +141,26 @@ def _log_metrics_and_instance_results(
         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None
 
-
-
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    management_client = LiteMLClient(
+        subscription_id=ws_triad.subscription_id,
+        resource_group=ws_triad.resource_group_name,
+        logger=LOGGER,
+        credential=kwargs.get("credential"),
+        # let the client automatically determine the credentials to use
+    )
+    tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri
 
     # Adding line_number as index column this is needed by UI to form link to individual instance run
     instance_results["line_number"] = instance_results.index.values
 
     with EvalRun(
         run_name=run.name if run is not None else evaluation_name,
-        tracking_uri=tracking_uri,
+        tracking_uri=cast(str, tracking_uri),
         subscription_id=ws_triad.subscription_id,
         group_name=ws_triad.resource_group_name,
         workspace_name=ws_triad.workspace_name,
-
+        management_client=management_client,
         promptflow_run=run,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
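Taken together, the hunks in this diff exercise only a small surface of the new internal LiteMLClient (added in _azure/_clients.py). A rough interface sketch inferred from the call sites shown above, not from the class itself; any details beyond those call sites are assumptions:

# Sketch inferred from call sites in this diff; names and shapes beyond the
# visible hunks are assumptions.
from typing import Protocol


class _LiteMLClientLike(Protocol):
    def get_token(self) -> str: ...
    # workspace_get_info(...).ml_flow_tracking_uri is read to build EvalRun
    def workspace_get_info(self, workspace_name: str): ...
    # returns a datastore with account_name, endpoint, container_name, credential
    def workspace_get_default_datastore(self, workspace_name: str, include_credentials: bool = False): ...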
@@ -166,9 +191,16 @@ def _log_metrics_and_instance_results(
                 properties={
                     EvaluationRunProperties.RUN_TYPE: "eval_run",
                     EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                 }
             )
+        else:
+            ev_run.write_properties_to_run_history(
+                properties={
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+                }
+            )
 
         for metric_name, metric_value in metrics.items():
             ev_run.log_metric(metric_name, metric_value)
@@ -296,3 +328,30 @@ def set_event_loop_policy() -> None:
     # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
     # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+
+
+class JSONLDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_json(self.filename, lines=True)
+
+
+class CSVDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_csv(self.filename)
+
+
+class DataLoaderFactory:
+    @staticmethod
+    def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+        filename_str = str(filename).lower()
+        if filename_str.endswith(".csv"):
+            return CSVDataFileLoader(filename)
+
+        # fallback to JSONL to maintain backward compatibility
+        return JSONLDataFileLoader(filename)