azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (33)
  1. azure/ai/evaluation/_azure/__init__.py +3 -0
  2. azure/ai/evaluation/_azure/_clients.py +188 -0
  3. azure/ai/evaluation/_azure/_models.py +227 -0
  4. azure/ai/evaluation/_azure/_token_manager.py +118 -0
  5. azure/ai/evaluation/_common/rai_service.py +30 -21
  6. azure/ai/evaluation/_constants.py +1 -0
  7. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
  8. azure/ai/evaluation/_evaluate/_eval_run.py +14 -43
  9. azure/ai/evaluation/_evaluate/_evaluate.py +9 -34
  10. azure/ai/evaluation/_evaluate/_utils.py +66 -34
  11. azure/ai/evaluation/_evaluators/_common/_base_eval.py +71 -1
  12. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -1
  13. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +33 -1
  14. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +3 -3
  15. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
  16. azure/ai/evaluation/_http_utils.py +6 -4
  17. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  18. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  19. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  20. azure/ai/evaluation/_version.py +1 -1
  21. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  22. azure/ai/evaluation/simulator/_adversarial_simulator.py +35 -16
  23. azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
  24. azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
  25. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -0
  26. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +40 -0
  27. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
  28. azure/ai/evaluation/simulator/_simulator.py +3 -0
  29. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +15 -10
  30. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +33 -29
  31. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
  32. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
  33. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_eval_run.py
@@ -22,29 +22,12 @@ from azure.ai.evaluation._version import VERSION
 from azure.core.pipeline.policies import RetryPolicy
 from azure.core.rest import HttpResponse
 from azure.core.exceptions import HttpResponseError
+from azure.storage.blob import BlobServiceClient
+from azure.ai.evaluation._azure._clients import LiteMLClient
 
 LOGGER = logging.getLogger(__name__)
 
 
-# Handle optional import. The azure libraries are only present if
-# promptflow-azure is installed.
-try:
-    from azure.ai.ml import MLClient
-    from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
-    from azure.ai.ml.entities._datastore.datastore import Datastore
-    from azure.storage.blob import BlobServiceClient
-except (ModuleNotFoundError, ImportError):
-    raise EvaluationException(  # pylint: disable=raise-missing-from
-        message=(
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        ),
-        target=ErrorTarget.EVALUATE,
-        category=ErrorCategory.MISSING_PACKAGE,
-        blame=ErrorBlame.USER_ERROR,
-    )
-
-
 @dataclasses.dataclass
 class RunInfo:
     """
@@ -93,15 +76,16 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
     :type group_name: str
     :param workspace_name: The name of workspace/project used to track run.
     :type workspace_name: str
-    :param ml_client: The ml client used for authentication into Azure.
-    :type ml_client: azure.ai.ml.MLClient
+    :param management_client: The trace destination string to parse the AI ML workspace blob store from.
+    :type management_client:
+        ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
     :param promptflow_run: The promptflow run used by the
+    :type promptflow_run: Optional[promptflow._sdk.entities.Run]
     """
 
     _MAX_RETRIES = 5
     _BACKOFF_FACTOR = 2
     _TIMEOUT = 5
-    _SCOPE = "https://management.azure.com/.default"
 
     EVALUATION_ARTIFACT = "instance_results.jsonl"
 
@@ -112,14 +96,14 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         subscription_id: str,
         group_name: str,
         workspace_name: str,
-        ml_client: "MLClient",
+        management_client: LiteMLClient,
         promptflow_run: Optional[Run] = None,
     ) -> None:
         self._tracking_uri: str = tracking_uri
         self._subscription_id: str = subscription_id
         self._resource_group_name: str = group_name
         self._workspace_name: str = workspace_name
-        self._ml_client: Any = ml_client
+        self._management_client: LiteMLClient = management_client
         self._is_promptflow_run: bool = promptflow_run is not None
         self._run_name = run_name
         self._promptflow_run = promptflow_run
@@ -184,7 +168,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         if self._promptflow_run is not None:
             self._info = RunInfo(
                 self._promptflow_run.name,
-                self._promptflow_run._experiment_name,  # pylint: disable=protected-access
+                self._promptflow_run._experiment_name or "",  # pylint: disable=protected-access
                 self._promptflow_run.name,
             )
         else:
@@ -310,12 +294,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"
 
-    def _get_token(self):
-        # We have to use lazy import because promptflow.azure
-        # is an optional dependency.
-        from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module
-
-        return ArmTokenCache().get_token(self._ml_client._credential)  # pylint: disable=protected-access
+    def _get_token(self) -> str:
+        return self._management_client.get_token()
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -441,9 +421,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             local_paths.append(local_file_path)
 
         # We will write the artifacts to the workspaceblobstore
-        datastore = self._ml_client.datastores.get_default(include_secrets=True)
+        datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
-        svc_client = BlobServiceClient(account_url=account_url, credential=self._get_datastore_credential(datastore))
+
+        svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
         try:
             for local, remote in zip(local_paths, remote_paths["paths"]):
                 blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
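
For orientation, the upload path above now takes the default datastore (including its credential) from LiteMLClient and hands that credential directly to BlobServiceClient. The sketch below shows the generic azure-storage-blob pattern this relies on; the account, container, credential, and blob paths are placeholders rather than values from the diff.

```python
# Generic azure-storage-blob upload pattern that the hunk above relies on;
# account, container, credential, and blob paths are placeholders.
from azure.storage.blob import BlobServiceClient

account_url = "https://<account>.blob.core.windows.net"
svc_client = BlobServiceClient(account_url=account_url, credential="<account-key-or-sas-token>")
blob_client = svc_client.get_blob_client(container="<container>", blob="<remote-path>/instance_results.jsonl")

with open("instance_results.jsonl", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)  # push each local artifact to its remote path
```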
@@ -515,16 +496,6 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)
 
-    def _get_datastore_credential(self, datastore: "Datastore"):
-        # Reference the logic in azure.ai.ml._artifact._artifact_utilities
-        # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
-        credential = datastore.credentials
-        if isinstance(credential, AccountKeyConfiguration):
-            return credential.account_key
-        if hasattr(credential, "sas_token"):
-            return credential.sas_token
-        return self._ml_client.datastores._credential  # pylint: disable=protected-access
-
     def log_metric(self, key: str, value: float) -> None:
         """
         Log the metric to azure similar to how it is done by mlflow.

azure/ai/evaluation/_evaluate/_evaluate.py
@@ -10,7 +10,6 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, T
 
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
 
@@ -21,7 +20,6 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
-    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
@@ -468,33 +466,14 @@ def _apply_target_to_data(
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
     _run_name = kwargs.get("_run_name")
-    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
-
-    try:
-        with TargetRunContext(upload_target_snaphot):
-            run: Run = pf_client.run(
-                flow=target,
-                display_name=evaluation_name,
-                data=data,
-                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-                stream=True,
-                name=_run_name,
-            )
-    except (UserAuthenticationError, UploadInternalError) as ex:
-        if "Failed to upload run" in ex.message:
-            msg = (
-                "Failed to upload the target run to the cloud. "
-                "This may be caused by insufficient permission to access storage or other errors."
-            )
-            raise EvaluationException(
-                message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                blame=ErrorBlame.USER_ERROR,
-                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-            ) from ex
-
-        raise ex
+    with TargetRunContext():
+        run: Run = pf_client.run(
+            flow=target,
+            display_name=evaluation_name,
+            data=data,
+            stream=True,
+            name=_run_name,
+        )
 
     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
@@ -834,11 +813,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     studio_url = None
     if trace_destination:
         studio_url = _log_metrics_and_instance_results(
-            metrics,
-            result_df,
-            trace_destination,
-            target_run,
-            evaluation_name,
+            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
         )
 
     result_df_dict = result_df.to_dict("records")

azure/ai/evaluation/_evaluate/_utils.py
@@ -7,12 +7,11 @@ import os
 import re
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
+from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64
 
 import pandas as pd
-from promptflow.client import PFClient
 from promptflow.entities import Run
 
 from azure.ai.evaluation._constants import (
@@ -23,6 +22,8 @@ from azure.ai.evaluation._constants import (
 )
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._azure._clients import LiteMLClient
 
 LOGGER = logging.getLogger(__name__)
 
@@ -45,6 +46,8 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
+    from promptflow._cli._utils import get_workspace_triad_from_local
+
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -58,10 +61,20 @@ def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-lon
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.UNKNOWN,
         )
+
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
+
+    # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+    # for backwards compatibility with what the original code that depended on promptflow-azure did
+    if not (subscription_id and resource_group_name and workspace_name):
+        local = get_workspace_triad_from_local()
+        subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+        resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+        workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+    return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")
 
 
 def load_jsonl(path):
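
The trace destination parsed above is an Azure ML workspace ARM-style path, and the new fallback fills any missing triad component from the local promptflow configuration or the AZUREML_ARM_* environment variables. The SDK's actual AZURE_WORKSPACE_REGEX_FORMAT is not shown in this diff, so the following is only an illustrative sketch that assumes the commonly documented azureml:// form.

```python
# Illustration only: the SDK's AZURE_WORKSPACE_REGEX_FORMAT is not shown in this diff,
# so this sketch assumes the commonly documented azureml:// trace destination form.
import re
from typing import NamedTuple


class Triad(NamedTuple):
    subscription_id: str
    resource_group_name: str
    workspace_name: str


PATTERN = re.compile(
    r"azureml://subscriptions/([^/]+)/resource[gG]roups/([^/]+)"
    r"/providers/Microsoft\.MachineLearningServices/workspaces/([^/]+)"
)


def parse_trace_destination(trace_destination: str) -> Triad:
    match = PATTERN.match(trace_destination)
    if not match:
        raise ValueError(f"Unrecognized trace destination: {trace_destination}")
    return Triad(*match.groups())


print(parse_trace_destination(
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg"
    "/providers/Microsoft.MachineLearningServices/workspaces/my-ws"
))
```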
@@ -69,19 +82,6 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
-    from promptflow.azure._cli._utils import _get_azure_pf_client
-
-    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
-    azure_pf_client = _get_azure_pf_client(
-        subscription_id=ws_triad.subscription_id,
-        resource_group=ws_triad.resource_group_name,
-        workspace_name=ws_triad.workspace_name,
-    )
-
-    return azure_pf_client, ws_triad
-
-
 def _store_multimodal_content(messages, tmpdir: str):
     # verify if images folder exists
     images_folder_path = os.path.join(tmpdir, "images")
@@ -91,23 +91,40 @@ def _store_multimodal_content(messages, tmpdir: str):
     for message in messages:
         if isinstance(message.get("content", []), list):
             for content in message.get("content", []):
-                if content.get("type") == "image_url":
-                    image_url = content.get("image_url")
-                    if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                        # Extract the base64 string
-                        base64image = image_url["url"].replace("data:image/jpg;base64,", "")
+                process_message_content(content, images_folder_path)
+
 
-                        # Generate a unique filename
-                        image_file_name = f"{str(uuid.uuid4())}.jpg"
-                        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+def process_message_content(content, images_folder_path):
+    if content.get("type", "") == "image_url":
+        image_url = content.get("image_url")
 
-                        # Decode the base64 string to binary image data
-                        image_data_binary = base64.b64decode(base64image)
+        if not image_url or "url" not in image_url:
+            return None
 
-                        # Write the binary image data to the file
-                        image_file_path = os.path.join(images_folder_path, image_file_name)
-                        with open(image_file_path, "wb") as f:
-                            f.write(image_data_binary)
+        url = image_url["url"]
+        if not url.startswith("data:image/"):
+            return None
+
+        match = re.search("data:image/([^;]+);", url)
+        if not match:
+            return None
+
+        ext = match.group(1)
+        # Extract the base64 string
+        base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")
+
+        # Generate a unique filename
+        image_file_name = f"{str(uuid.uuid4())}.{ext}"
+        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+        # Decode the base64 string to binary image data
+        image_data_binary = base64.b64decode(base64image)
+
+        # Write the binary image data to the file
+        image_file_path = os.path.join(images_folder_path, image_file_name)
+        with open(image_file_path, "wb") as f:
+            f.write(image_data_binary)
+    return None
 
 
 def _log_metrics_and_instance_results(
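
The refactor above replaces the hard-coded data:image/jpg;base64, handling with a helper that accepts any data:image/<ext>;base64, payload, writes it to images/<uuid>.<ext>, and rewrites the message's URL in place. A condensed standalone sketch of that behaviour (not the SDK function itself):

```python
# Condensed, standalone sketch of the new behaviour (not the SDK function itself).
import base64
import os
import re
import tempfile
import uuid


def save_image_content(content: dict, images_folder_path: str) -> None:
    if content.get("type", "") != "image_url":
        return
    image_url = content.get("image_url") or {}
    url = image_url.get("url", "")
    match = re.search(r"data:image/([^;]+);base64,", url)
    if not match:
        return
    ext = match.group(1)
    payload = url.replace(f"data:image/{ext};base64,", "")
    file_name = f"{uuid.uuid4()}.{ext}"
    image_url["url"] = f"images/{file_name}"  # rewrite the message in place
    with open(os.path.join(images_folder_path, file_name), "wb") as fp:
        fp.write(base64.b64decode(payload))


with tempfile.TemporaryDirectory() as tmp:
    content = {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64," + base64.b64encode(b"fake-bytes").decode()},
    }
    save_image_content(content, tmp)
    print(content["image_url"]["url"])  # images/<uuid>.png
```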
@@ -116,6 +133,7 @@ def _log_metrics_and_instance_results(
     trace_destination: Optional[str],
     run: Run,
     evaluation_name: Optional[str],
+    **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
 
@@ -123,19 +141,26 @@ def _log_metrics_and_instance_results(
         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None
 
-    azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
-    tracking_uri = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name).mlflow_tracking_uri
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    management_client = LiteMLClient(
+        subscription_id=ws_triad.subscription_id,
+        resource_group=ws_triad.resource_group_name,
+        logger=LOGGER,
+        credential=kwargs.get("credential"),
+        # let the client automatically determine the credentials to use
+    )
+    tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri
 
     # Adding line_number as index column this is needed by UI to form link to individual instance run
     instance_results["line_number"] = instance_results.index.values
 
     with EvalRun(
         run_name=run.name if run is not None else evaluation_name,
-        tracking_uri=tracking_uri,
+        tracking_uri=cast(str, tracking_uri),
         subscription_id=ws_triad.subscription_id,
         group_name=ws_triad.resource_group_name,
         workspace_name=ws_triad.workspace_name,
-        ml_client=azure_pf_client.ml_client,
+        management_client=management_client,
         promptflow_run=run,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
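
Because _log_metrics_and_instance_results now accepts **kwargs and reads kwargs.get("credential"), an explicitly supplied credential can reach LiteMLClient instead of relying on ambient authentication. The sketch below assumes the public evaluate(...) forwards unrecognized keyword arguments down to this function, as the _evaluate hunk earlier suggests; the dataset path and project values are placeholders.

```python
# Hedged usage sketch: threading an explicit credential through evaluate(...)
# so it can be picked up via kwargs.get("credential") and handed to LiteMLClient.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import evaluate, ViolenceEvaluator

azure_ai_project = {
    "subscription_id": "<sub-id>",
    "resource_group_name": "<rg>",
    "project_name": "<project>",
}
credential = DefaultAzureCredential()

results = evaluate(
    data="eval_data.jsonl",  # hypothetical dataset path
    evaluators={"violence": ViolenceEvaluator(credential, azure_ai_project)},
    azure_ai_project=azure_ai_project,
    credential=credential,  # assumed to be forwarded through **kwargs as shown above
)
```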
@@ -166,9 +191,16 @@ def _log_metrics_and_instance_results(
                 properties={
                     EvaluationRunProperties.RUN_TYPE: "eval_run",
                     EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                 }
             )
+        else:
+            ev_run.write_properties_to_run_history(
+                properties={
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+                }
+            )
 
         for metric_name, metric_value in metrics.items():
             ev_run.log_metric(metric_name, metric_value)

azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -12,6 +12,7 @@ from typing_extensions import ParamSpec, TypeAlias, get_overloads
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._model_configurations import Conversation
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -202,6 +203,59 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return converter
 
+    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
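
The converter added above pairs each user message with the following assistant message, optionally prepends a single system message, and wraps each pair as its own evaluable conversation. A sketch of the shapes involved, with plain dicts standing in for the SDK's Conversation TypedDict:

```python
# Shapes only: plain dicts stand in for the SDK's Conversation TypedDict.
conversation = {
    "messages": [
        {"role": "system", "content": "You describe images."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this picture?"},
                {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
            ],
        },
        {"role": "assistant", "content": "A cat sitting on a windowsill."},
    ]
}

# One user/assistant pair plus the single system message yields one evaluable input:
# [{"conversation": {"messages": [<system>, <user>, <assistant>]}}]
```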
@@ -210,7 +264,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         values.
 
         The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self._allow_converssation_input is used to determine if a conversation
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
         is a valid input.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
@@ -241,6 +295,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             )
         # Handle Conversation
         if conversation is not None:
+            if self._is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
         required_singletons = remove_optional_singletons(self, singletons)
@@ -255,6 +311,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             target=ErrorTarget.CONVERSATION,
         )
 
+    def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
 

azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -10,6 +10,7 @@ from promptflow.core import AsyncPrompty
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase
 
@@ -47,10 +48,12 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         self._prompty_file = prompty_file
         super().__init__(eval_last_turn=eval_last_turn)
 
+        subclass_name = self.__class__.__name__
+        user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
+            user_agent,
         )
 
         self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
@@ -69,6 +72,14 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         score = math.nan

azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -11,9 +11,10 @@ from azure.ai.evaluation._common.constants import (
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
@@ -81,6 +82,36 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" in eval_input and "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys.
+
+        :param conversation: The conversation to evaluate.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
         if query is None or response is None:
@@ -111,6 +142,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
             project_scope=self._azure_ai_project,
             credential=self._credential,
             annotation_task=self._get_task(),
+            evaluator_name=self.__class__.__name__,
         )
 
     def _get_task(self):

azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -20,7 +20,7 @@ from ._violence import ViolenceEvaluator
 @experimental
 class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     """
-    Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.
+    Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
@@ -47,7 +47,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     # TODO address 3579092 to re-enabled parallel evals.
     def __init__(self, credential, azure_ai_project, **kwargs):
         super().__init__()
-        self._parallel = kwargs.pop("_parallel", False)
+        self._parallel = kwargs.pop("_parallel", True)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
@@ -130,7 +130,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
         with ThreadPoolExecutor() as executor:
             # pylint: disable=no-value-for-parameter
             futures = {
-                executor.submit(query=query, response=response, conversation=conversation): evaluator
+                executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
                 for evaluator in self._evaluators
            }
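
The one-line fix above matters because ThreadPoolExecutor.submit expects the callable as its first positional argument; the 1.0.1 form passed only keyword arguments, so submitting the sub-evaluators raised a TypeError. A minimal standalone illustration of the corrected pattern, using fake evaluators rather than the SDK classes:

```python
# Standalone illustration of the corrected submit pattern (fake evaluators, not SDK classes).
from concurrent.futures import ThreadPoolExecutor, as_completed


def make_evaluator(name):
    def _evaluate(*, query, response, conversation=None):
        return {name: len(response)}
    return _evaluate


evaluators = [make_evaluator("violence"), make_evaluator("sexual")]
results = {}
with ThreadPoolExecutor() as executor:
    # The callable goes first; keyword arguments are forwarded to it by submit().
    futures = {
        executor.submit(evaluator, query="hi", response="hello there", conversation=None): evaluator
        for evaluator in evaluators
    }
    for future in as_completed(futures):
        results.update(future.result())
print(results)  # e.g. {'violence': 11, 'sexual': 11}
```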
 

azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -80,7 +80,7 @@ class _AsyncSimilarityEvaluator:
 
 class SimilarityEvaluator:
     """
-    Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.
+    Evaluates similarity score for a given query, response, and ground truth.
 
     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both

azure/ai/evaluation/_http_utils.py
@@ -448,19 +448,21 @@ class AsyncHttpPipeline(AsyncPipeline):
         return cast(Self, await super().__aenter__())
 
 
-def get_http_client() -> HttpPipeline:
+def get_http_client(**kwargs: Any) -> HttpPipeline:
     """Get an HttpPipeline configured with common policies.
 
     :returns: An HttpPipeline with a set of applied policies:
     :rtype: HttpPipeline
     """
-    return HttpPipeline(user_agent_policy=UserAgentPolicy(base_user_agent=USER_AGENT))
+    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+    return HttpPipeline(**kwargs)
 
 
-def get_async_http_client() -> AsyncHttpPipeline:
+def get_async_http_client(**kwargs: Any) -> AsyncHttpPipeline:
     """Get an AsyncHttpPipeline configured with common policies.
 
     :returns: An AsyncHttpPipeline with a set of applied policies:
     :rtype: AsyncHttpPipeline
     """
-    return AsyncHttpPipeline(user_agent_policy=UserAgentPolicy(base_user_agent=USER_AGENT))
+    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+    return AsyncHttpPipeline(**kwargs)
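
With the **kwargs pass-through, callers can supply their own pipeline policies, and the default user agent is applied only when none is given (setdefault). A small hedged sketch; user_agent_policy is the keyword confirmed by this hunk, and other azure-core policy keywords are assumed to be forwarded the same way:

```python
# Hedged sketch: supplying a custom policy through the new **kwargs pass-through.
from azure.core.pipeline.policies import UserAgentPolicy
from azure.ai.evaluation._http_utils import get_http_client

# setdefault in get_http_client means an explicitly supplied policy wins over the default;
# other policy keywords (e.g. retry_policy=...) are assumed to be forwarded the same way.
client = get_http_client(user_agent_policy=UserAgentPolicy(base_user_agent="my-app/0.1"))
```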

azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py
@@ -32,10 +32,6 @@ ROUGE-1.5.5.pl -m -e data -n 2 -a settings.xml
 In these examples settings.xml lists input files and formats.
 """
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import collections
 import re
 

azure/ai/evaluation/_vendor/rouge_score/scoring.py
@@ -21,10 +21,6 @@ Aggregation functions use bootstrap resampling to compute confidence intervals
 as per the original ROUGE perl implementation.
 """
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import abc
 import collections
 from typing import Dict