azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +188 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/rai_service.py +30 -21
- azure/ai/evaluation/_constants.py +1 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +9 -34
- azure/ai/evaluation/_evaluate/_utils.py +66 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +71 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -1
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +33 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +3 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +35 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +40 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +3 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +15 -10
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +33 -29
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_eval_run.py

@@ -22,29 +22,12 @@ from azure.ai.evaluation._version import VERSION
 from azure.core.pipeline.policies import RetryPolicy
 from azure.core.rest import HttpResponse
 from azure.core.exceptions import HttpResponseError
+from azure.storage.blob import BlobServiceClient
+from azure.ai.evaluation._azure._clients import LiteMLClient

 LOGGER = logging.getLogger(__name__)


-# Handle optional import. The azure libraries are only present if
-# promptflow-azure is installed.
-try:
-    from azure.ai.ml import MLClient
-    from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
-    from azure.ai.ml.entities._datastore.datastore import Datastore
-    from azure.storage.blob import BlobServiceClient
-except (ModuleNotFoundError, ImportError):
-    raise EvaluationException(  # pylint: disable=raise-missing-from
-        message=(
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        ),
-        target=ErrorTarget.EVALUATE,
-        category=ErrorCategory.MISSING_PACKAGE,
-        blame=ErrorBlame.USER_ERROR,
-    )
-
-
 @dataclasses.dataclass
 class RunInfo:
     """
@@ -93,15 +76,16 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
     :type group_name: str
     :param workspace_name: The name of workspace/project used to track run.
     :type workspace_name: str
-    :param
-    :type
+    :param management_client: The trace destination string to parse the AI ML workspace blob store from.
+    :type management_client:
+        ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
     :param promptflow_run: The promptflow run used by the
+    :type promptflow_run: Optional[promptflow._sdk.entities.Run]
     """

     _MAX_RETRIES = 5
     _BACKOFF_FACTOR = 2
     _TIMEOUT = 5
-    _SCOPE = "https://management.azure.com/.default"

     EVALUATION_ARTIFACT = "instance_results.jsonl"

@@ -112,14 +96,14 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         subscription_id: str,
         group_name: str,
         workspace_name: str,
-
+        management_client: LiteMLClient,
         promptflow_run: Optional[Run] = None,
     ) -> None:
         self._tracking_uri: str = tracking_uri
         self._subscription_id: str = subscription_id
         self._resource_group_name: str = group_name
         self._workspace_name: str = workspace_name
-        self.
+        self._management_client: LiteMLClient = management_client
         self._is_promptflow_run: bool = promptflow_run is not None
         self._run_name = run_name
         self._promptflow_run = promptflow_run
@@ -184,7 +168,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         if self._promptflow_run is not None:
             self._info = RunInfo(
                 self._promptflow_run.name,
-                self._promptflow_run._experiment_name,  # pylint: disable=protected-access
+                self._promptflow_run._experiment_name or "",  # pylint: disable=protected-access
                 self._promptflow_run.name,
             )
         else:
@@ -310,12 +294,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"

-    def _get_token(self):
-
-        # is an optional dependency.
-        from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module
-
-        return ArmTokenCache().get_token(self._ml_client._credential)  # pylint: disable=protected-access
+    def _get_token(self) -> str:
+        return self._management_client.get_token()

     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -441,9 +421,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
            local_paths.append(local_file_path)

        # We will write the artifacts to the workspaceblobstore
-        datastore = self.
+        datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
-
+
+        svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
         try:
             for local, remote in zip(local_paths, remote_paths["paths"]):
                 blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
@@ -515,16 +496,6 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)

-    def _get_datastore_credential(self, datastore: "Datastore"):
-        # Reference the logic in azure.ai.ml._artifact._artifact_utilities
-        # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
-        credential = datastore.credentials
-        if isinstance(credential, AccountKeyConfiguration):
-            return credential.account_key
-        if hasattr(credential, "sas_token"):
-            return credential.sas_token
-        return self._ml_client.datastores._credential  # pylint: disable=protected-access
-
     def log_metric(self, key: str, value: float) -> None:
         """
         Log the metric to azure similar to how it is done by mlflow.
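
Taken together, the _eval_run.py hunks swap the optional azure-ai-ml / promptflow-azure dependency for the package's new internal LiteMLClient. Below is a minimal sketch of how EvalRun is now wired up, based only on the signatures visible in this diff; the workspace coordinates are placeholders, and the modules involved are private, so this is illustrative rather than a supported public API.

    import logging

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation._azure._clients import LiteMLClient
    from azure.ai.evaluation._evaluate._eval_run import EvalRun

    logger = logging.getLogger(__name__)

    # Placeholder workspace coordinates; substitute real values.
    subscription_id = "<subscription-id>"
    resource_group = "<resource-group>"
    workspace_name = "<workspace-name>"

    # LiteMLClient constructor arguments as they appear in the _utils.py hunks below.
    management_client = LiteMLClient(
        subscription_id=subscription_id,
        resource_group=resource_group,
        logger=logger,
        credential=DefaultAzureCredential(),  # optional; the client can resolve credentials itself
    )

    # The MLflow tracking URI is now read from the workspace info instead of promptflow-azure.
    tracking_uri = management_client.workspace_get_info(workspace_name).ml_flow_tracking_uri

    with EvalRun(
        run_name="example-eval-run",
        tracking_uri=tracking_uri,
        subscription_id=subscription_id,
        group_name=resource_group,
        workspace_name=workspace_name,
        management_client=management_client,
    ) as ev_run:
        ev_run.log_metric("example_metric", 1.0)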

azure/ai/evaluation/_evaluate/_evaluate.py

@@ -10,7 +10,6 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, T

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run

@@ -21,7 +20,6 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
-    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
@@ -468,33 +466,14 @@ def _apply_target_to_data(
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
     _run_name = kwargs.get("_run_name")
-
-
-
-
-
-
-
-
-                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-                stream=True,
-                name=_run_name,
-            )
-        except (UserAuthenticationError, UploadInternalError) as ex:
-            if "Failed to upload run" in ex.message:
-                msg = (
-                    "Failed to upload the target run to the cloud. "
-                    "This may be caused by insufficient permission to access storage or other errors."
-                )
-                raise EvaluationException(
-                    message=msg,
-                    target=ErrorTarget.EVALUATE,
-                    category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                    blame=ErrorBlame.USER_ERROR,
-                    tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-                ) from ex
-
-            raise ex
+    with TargetRunContext():
+        run: Run = pf_client.run(
+            flow=target,
+            display_name=evaluation_name,
+            data=data,
+            stream=True,
+            name=_run_name,
+        )

     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
@@ -834,11 +813,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     studio_url = None
     if trace_destination:
         studio_url = _log_metrics_and_instance_results(
-            metrics,
-            result_df,
-            trace_destination,
-            target_run,
-            evaluation_name,
+            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
         )

     result_df_dict = result_df.to_dict("records")
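
The _evaluate.py hunks above drop the promptflow upload-error handling around the target run and forward **kwargs down into _log_metrics_and_instance_results, where (per the _utils.py hunks below) an optional credential is read. A hedged sketch of the resulting call pattern, assuming the public evaluate entry point forwards extra keyword arguments the same way _evaluate does; the data file, project values, and column contents are placeholders.

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation import F1ScoreEvaluator, evaluate

    # Hypothetical call site: the explicit credential is what
    # _log_metrics_and_instance_results picks up via kwargs.get("credential")
    # and hands to LiteMLClient for remote tracking.
    results = evaluate(
        data="eval_data.jsonl",  # placeholder; must contain the columns the evaluator expects
        evaluators={"f1_score": F1ScoreEvaluator()},
        azure_ai_project={
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project-name>",
        },
        credential=DefaultAzureCredential(),
    )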

azure/ai/evaluation/_evaluate/_utils.py

@@ -7,12 +7,6 @@ import os
 import re
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional,
+from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64

 import pandas as pd
-from promptflow.client import PFClient
 from promptflow.entities import Run

 from azure.ai.evaluation._constants import (
@@ -23,6 +22,8 @@ from azure.ai.evaluation._constants import (
 )
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._azure._clients import LiteMLClient

 LOGGER = logging.getLogger(__name__)

@@ -45,6 +46,8 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
+    from promptflow._cli._utils import get_workspace_triad_from_local
+
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -58,10 +61,20 @@ def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.UNKNOWN,
         )
+
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-
+
+    # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+    # for backwards compatibility with what the original code that depended on promptflow-azure did
+    if not (subscription_id and resource_group_name and workspace_name):
+        local = get_workspace_triad_from_local()
+        subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+        resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+        workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+    return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")


 def load_jsonl(path):
@@ -69,19 +82,6 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]


-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
-    from promptflow.azure._cli._utils import _get_azure_pf_client
-
-    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
-    azure_pf_client = _get_azure_pf_client(
-        subscription_id=ws_triad.subscription_id,
-        resource_group=ws_triad.resource_group_name,
-        workspace_name=ws_triad.workspace_name,
-    )
-
-    return azure_pf_client, ws_triad
-
-
 def _store_multimodal_content(messages, tmpdir: str):
     # verify if images folder exists
     images_folder_path = os.path.join(tmpdir, "images")
@@ -91,23 +91,40 @@ def _store_multimodal_content(messages, tmpdir: str):
     for message in messages:
         if isinstance(message.get("content", []), list):
             for content in message.get("content", []):
-
-
-                if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                    # Extract the base64 string
-                    base64image = image_url["url"].replace("data:image/jpg;base64,", "")
+                process_message_content(content, images_folder_path)
+

-
-
-
+def process_message_content(content, images_folder_path):
+    if content.get("type", "") == "image_url":
+        image_url = content.get("image_url")

-
-
+        if not image_url or "url" not in image_url:
+            return None

-
-
-
-
+        url = image_url["url"]
+        if not url.startswith("data:image/"):
+            return None
+
+        match = re.search("data:image/([^;]+);", url)
+        if not match:
+            return None
+
+        ext = match.group(1)
+        # Extract the base64 string
+        base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")
+
+        # Generate a unique filename
+        image_file_name = f"{str(uuid.uuid4())}.{ext}"
+        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+        # Decode the base64 string to binary image data
+        image_data_binary = base64.b64decode(base64image)
+
+        # Write the binary image data to the file
+        image_file_path = os.path.join(images_folder_path, image_file_name)
+        with open(image_file_path, "wb") as f:
+            f.write(image_data_binary)
+    return None


 def _log_metrics_and_instance_results(
@@ -116,6 +133,7 @@ def _log_metrics_and_instance_results(
     trace_destination: Optional[str],
     run: Run,
     evaluation_name: Optional[str],
+    **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun

@@ -123,19 +141,26 @@
         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None

-
-
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    management_client = LiteMLClient(
+        subscription_id=ws_triad.subscription_id,
+        resource_group=ws_triad.resource_group_name,
+        logger=LOGGER,
+        credential=kwargs.get("credential"),
+        # let the client automatically determine the credentials to use
+    )
+    tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri

     # Adding line_number as index column this is needed by UI to form link to individual instance run
     instance_results["line_number"] = instance_results.index.values

     with EvalRun(
         run_name=run.name if run is not None else evaluation_name,
-        tracking_uri=tracking_uri,
+        tracking_uri=cast(str, tracking_uri),
         subscription_id=ws_triad.subscription_id,
         group_name=ws_triad.resource_group_name,
         workspace_name=ws_triad.workspace_name,
-
+        management_client=management_client,
         promptflow_run=run,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
@@ -166,9 +191,16 @@
                 properties={
                     EvaluationRunProperties.RUN_TYPE: "eval_run",
                     EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                 }
             )
+        else:
+            ev_run.write_properties_to_run_history(
+                properties={
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+                }
+            )

         for metric_name, metric_value in metrics.items():
             ev_run.log_metric(metric_name, metric_value)
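
The rewritten _store_multimodal_content delegates per-item handling to the new module-level process_message_content helper, which accepts any data:image/<ext>;base64 payload instead of only data:image/jpg. A small sketch of calling the helper directly; it lives in a private module, so this is illustration only, and the payload bytes are arbitrary because the helper simply base64-decodes and writes them.

    import base64
    import os
    import tempfile

    from azure.ai.evaluation._evaluate._utils import process_message_content

    # Any bytes will do: the helper only base64-decodes the payload and writes it to disk.
    payload = base64.b64encode(b"fake image bytes").decode()
    content = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{payload}"}}

    with tempfile.TemporaryDirectory() as tmpdir:
        images_folder = os.path.join(tmpdir, "images")
        os.makedirs(images_folder, exist_ok=True)
        process_message_content(content, images_folder)
        print(content["image_url"]["url"])  # rewritten to e.g. "images/<uuid>.png"
        print(os.listdir(images_folder))    # the decoded file on disk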

azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -12,6 +12,7 @@ from typing_extensions import ParamSpec, TypeAlias, get_overloads
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._model_configurations import Conversation

 P = ParamSpec("P")
 T = TypeVar("T")
@@ -202,6 +203,59 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return converter

+    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
@@ -210,7 +264,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         values.

         The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self.
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
         is a valid input.

         If both conversations and singletons are allowed, the function will raise an exception if both
@@ -241,6 +295,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             )
         # Handle Conversation
         if conversation is not None:
+            if self._is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
         required_singletons = remove_optional_singletons(self, singletons)
@@ -255,6 +311,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 target=ErrorTarget.CONVERSATION,
             )

+    def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.

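
For reference, this is the shape of conversation that the new _is_multi_modal_conversation check classifies as multi-modal and that multi_modal_converter splits into per-turn inputs. The payload below is an illustrative example, not taken from the package documentation.

    # One user/assistant pair with an image part in the user turn.
    multi_modal_conversation = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,<...>"}},
                ],
            },
            {"role": "assistant", "content": "The image shows a placeholder graphic."},
        ]
    }
    # _is_multi_modal_conversation returns True because a list-valued "content"
    # contains an item with type == "image_url" and an image_url["url"] entry.
    # multi_modal_converter then yields one {"conversation": Conversation(...)} input
    # per user/assistant pair, enforcing exactly one assistant message per conversation.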

azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -10,6 +10,7 @@ from promptflow.core import AsyncPrompty
 from typing_extensions import override

 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase

@@ -47,10 +48,12 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         self._prompty_file = prompty_file
         super().__init__(eval_last_turn=eval_last_turn)

+        subclass_name = self.__class__.__name__
+        user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
-
+            user_agent,
         )

         self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
@@ -69,6 +72,14 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

         score = math.nan

azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -11,9 +11,10 @@ from azure.ai.evaluation._common.constants import (
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential

 from . import EvaluatorBase
@@ -81,6 +82,36 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" in eval_input and "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys.
+
+        :param conversation: The conversation to evaluate.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
         if query is None or response is None:
@@ -111,6 +142,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
             project_scope=self._azure_ai_project,
             credential=self._credential,
             annotation_task=self._get_task(),
+            evaluator_name=self.__class__.__name__,
         )

     def _get_task(self):

azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -20,7 +20,7 @@ from ._violence import ViolenceEvaluator
 @experimental
 class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     """
-    Initialize a content safety evaluator configured to evaluate content
+    Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
@@ -47,7 +47,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     # TODO address 3579092 to re-enabled parallel evals.
     def __init__(self, credential, azure_ai_project, **kwargs):
         super().__init__()
-        self._parallel = kwargs.pop("_parallel",
+        self._parallel = kwargs.pop("_parallel", True)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
@@ -130,7 +130,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
         with ThreadPoolExecutor() as executor:
             # pylint: disable=no-value-for-parameter
             futures = {
-                executor.submit(query=query, response=response, conversation=conversation): evaluator
+                executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
                 for evaluator in self._evaluators
             }

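
A hedged usage sketch tying these hunks together: ContentSafetyEvaluator (and the RAI-service evaluators it wraps) can be called either with query/response strings or with a conversation, and conversations containing image_url content are routed through the new evaluate_with_rai_service_multimodal path. The project values, credential, and conversation below are placeholders.

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation import ContentSafetyEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    safety_eval = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )

    # Text-only QA input.
    text_result = safety_eval(query="What is the capital of France?", response="Paris.")

    # Multi-modal conversation input (same shape as the payload shown earlier).
    conversation = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image."},
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,<...>"}},
                ],
            },
            {"role": "assistant", "content": "It is a placeholder graphic."},
        ]
    }
    conversation_result = safety_eval(conversation=conversation)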

azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -80,7 +80,7 @@ class _AsyncSimilarityEvaluator:

 class SimilarityEvaluator:
     """
-    Evaluates similarity score for a given query, response, and ground truth
+    Evaluates similarity score for a given query, response, and ground truth.

     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both

azure/ai/evaluation/_http_utils.py

@@ -448,19 +448,21 @@ class AsyncHttpPipeline(AsyncPipeline):
         return cast(Self, await super().__aenter__())


-def get_http_client() -> HttpPipeline:
+def get_http_client(**kwargs: Any) -> HttpPipeline:
     """Get an HttpPipeline configured with common policies.

     :returns: An HttpPipeline with a set of applied policies:
     :rtype: HttpPipeline
     """
-
+    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+    return HttpPipeline(**kwargs)


-def get_async_http_client() -> AsyncHttpPipeline:
+def get_async_http_client(**kwargs: Any) -> AsyncHttpPipeline:
     """Get an AsyncHttpPipeline configured with common policies.

     :returns: An AsyncHttpPipeline with a set of applied policies:
     :rtype: AsyncHttpPipeline
     """
-
+    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+    return AsyncHttpPipeline(**kwargs)
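
get_http_client and get_async_http_client now accept arbitrary pipeline keyword arguments while defaulting the user-agent policy via kwargs.setdefault. A short sketch, assuming direct use of the private azure.ai.evaluation._http_utils module (illustration only):

    from azure.core.pipeline.policies import UserAgentPolicy

    from azure.ai.evaluation._http_utils import get_http_client

    # Default behaviour: the package's USER_AGENT is applied via kwargs.setdefault(...).
    default_client = get_http_client()

    # Overriding the default: a policy passed explicitly wins over the setdefault call.
    custom_client = get_http_client(
        user_agent_policy=UserAgentPolicy(base_user_agent="my-app/1.0")
    )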

azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py

@@ -32,10 +32,6 @@ ROUGE-1.5.5.pl -m -e data -n 2 -a settings.xml
 In these examples settings.xml lists input files and formats.
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import collections
 import re


azure/ai/evaluation/_vendor/rouge_score/scoring.py

@@ -21,10 +21,6 @@ Aggregation functions use bootstrap resampling to compute confidence intervals
 as per the original ROUGE perl implementation.
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import abc
 import collections
 from typing import Dict