azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (48)
  1. azure/ai/evaluation/__init__.py +1 -15
  2. azure/ai/evaluation/_azure/_clients.py +24 -8
  3. azure/ai/evaluation/_azure/_models.py +2 -2
  4. azure/ai/evaluation/_common/utils.py +8 -8
  5. azure/ai/evaluation/_constants.py +21 -0
  6. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
  7. azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
  8. azure/ai/evaluation/_evaluate/_evaluate.py +74 -14
  9. azure/ai/evaluation/_evaluate/_utils.py +27 -0
  10. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
  11. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
  13. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
  14. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
  15. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  16. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
  17. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
  18. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
  20. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
  21. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
  22. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
  23. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
  24. azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
  25. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
  26. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
  27. azure/ai/evaluation/_exceptions.py +0 -1
  28. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  29. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +640 -0
  30. azure/ai/evaluation/_version.py +2 -1
  31. azure/ai/evaluation/simulator/_adversarial_simulator.py +10 -3
  32. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  33. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  34. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  35. azure/ai/evaluation/simulator/_simulator.py +21 -13
  36. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/METADATA +77 -7
  37. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/RECORD +40 -44
  38. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  39. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  40. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  41. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  42. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  43. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  44. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  45. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  46. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/NOTICE.txt +0 -0
  47. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/WHEEL +0 -0
  48. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/__init__.py
@@ -12,14 +12,6 @@ from ._evaluators._content_safety import (
     SexualEvaluator,
     ViolenceEvaluator,
 )
-from ._evaluators._multimodal._content_safety_multimodal import (
-    ContentSafetyMultimodalEvaluator,
-    HateUnfairnessMultimodalEvaluator,
-    SelfHarmMultimodalEvaluator,
-    SexualMultimodalEvaluator,
-    ViolenceMultimodalEvaluator,
-)
-from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
@@ -72,11 +64,5 @@ __all__ = [
     "EvaluatorConfig",
     "Conversation",
     "Message",
-    "EvaluationResult",
-    "ContentSafetyMultimodalEvaluator",
-    "HateUnfairnessMultimodalEvaluator",
-    "SelfHarmMultimodalEvaluator",
-    "SexualMultimodalEvaluator",
-    "ViolenceMultimodalEvaluator",
-    "ProtectedMaterialMultimodalEvaluator",
+    "EvaluationResult"
 ]

azure/ai/evaluation/_azure/_clients.py
@@ -17,7 +17,7 @@ from azure.ai.evaluation.simulator._model_tools._identity_manager import TokenSc
 from ._models import BlobStoreInfo, Workspace


-API_VERSION: Final[str] = "2024-10-01"
+API_VERSION: Final[str] = "2024-07-01-preview"
 QUERY_KEY_API_VERSION: Final[str] = "api-version"
 PATH_ML_WORKSPACES = ("providers", "Microsoft.MachineLearningServices", "workspaces")

@@ -69,7 +69,9 @@ class LiteMLClient:
         self._get_token_manager()
         return cast(TokenCredential, self._credential)

-    def workspace_get_default_datastore(self, workspace_name: str, include_credentials: bool = False) -> BlobStoreInfo:
+    def workspace_get_default_datastore(
+        self, workspace_name: str, *, include_credentials: bool = False, **kwargs: Any
+    ) -> BlobStoreInfo:
         # 1. Get the default blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list?view=rest-azureml-2024-10-01
@@ -92,18 +94,29 @@
         account_name = props_json["accountName"]
         endpoint = props_json["endpoint"]
         container_name = props_json["containerName"]
+        credential_type = props_json.get("credentials", {}).get("credentialsType")

         # 2. Get the SAS token to use for accessing the blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list-secrets?view=rest-azureml-2024-10-01
-        blob_store_credential: Optional[Union[AzureSasCredential, str]] = None
-        if include_credentials:
+        blob_store_credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
+        if not include_credentials:
+            blob_store_credential = None
+        elif credential_type and credential_type.lower() == "none":
+            # If storage account key access is disabled, and only Microsoft Entra ID authentication is available,
+            # the credentialsType will be "None" and we should not attempt to get the secrets.
+            blob_store_credential = self.get_credential()
+        else:
             url = self._generate_path(
                 *PATH_ML_WORKSPACES, workspace_name, "datastores", "workspaceblobstore", "listSecrets"
             )
             secrets_response = self._http_client.request(
                 method="POST",
                 url=url,
+                json={
+                    "expirableSecret": True,
+                    "expireAfterHours": int(kwargs.get("key_expiration_hours", 1)),
+                },
                 params={
                     QUERY_KEY_API_VERSION: self._api_version,
                 },
@@ -114,10 +127,13 @@
             secrets_json = secrets_response.json()
             secrets_type = secrets_json["secretsType"].lower()

+            # As per this website, only SAS tokens, access tokens, or Entra IDs are valid for accessing blob data
+            # stores:
+            # https://learn.microsoft.com/rest/api/storageservices/authorize-requests-to-azure-storage.
             if secrets_type == "sas":
                 blob_store_credential = AzureSasCredential(secrets_json["sasToken"])
             elif secrets_type == "accountkey":
-                # To support olders versions of azure-storage-blob better, we return a string here instead of
+                # To support older versions of azure-storage-blob better, we return a string here instead of
                 # an AzureNamedKeyCredential
                 blob_store_credential = secrets_json["key"]
             else:
@@ -164,19 +180,19 @@
             # nothing to see here, move along
             return

-        additional_info: Optional[str] = None
+        message = f"The {description} request failed with HTTP {response.status_code}"
         try:
             error_json = response.json()["error"]
             additional_info = f"({error_json['code']}) {error_json['message']}"
+            message += f" - {additional_info}"
         except (JSONDecodeError, ValueError, KeyError):
             pass

         raise EvaluationException(
-            message=f"The {description} request failed with HTTP {response.status_code}",
+            message=message,
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.SYSTEM_ERROR,
-            internal_message=additional_info,
         )

     def _generate_path(self, *paths: str) -> str:
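
Note: a minimal call sketch for the reworked datastore accessor above. `lite_client` is a placeholder for a LiteMLClient instance, and the workspace name and expiration value are invented; the keyword names come from the diff itself.

```python
# include_credentials is now keyword-only, and key_expiration_hours is forwarded
# as "expireAfterHours" in the listSecrets request body shown above.
info = lite_client.workspace_get_default_datastore(
    "my-workspace",
    include_credentials=True,
    key_expiration_hours=2,
)

# info.credential may now be an AzureSasCredential, an account-key string,
# a TokenCredential (when storage key access is disabled), or None.
print(type(info.credential))
```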

azure/ai/evaluation/_azure/_models.py
@@ -8,7 +8,7 @@

 from typing import Dict, List, NamedTuple, Optional, Union
 from msrest.serialization import Model
-from azure.core.credentials import AzureSasCredential
+from azure.core.credentials import AzureSasCredential, TokenCredential


 class BlobStoreInfo(NamedTuple):
@@ -16,7 +16,7 @@ class BlobStoreInfo(NamedTuple):
     account_name: str
     endpoint: str
     container_name: str
-    credential: Optional[Union[AzureSasCredential, str]]
+    credential: Optional[Union[AzureSasCredential, TokenCredential, str]]


 class WorkspaceHubConfig(Model):

azure/ai/evaluation/_common/utils.py
@@ -366,7 +366,7 @@ def validate_conversation(conversation):
     if not isinstance(messages, list):
         raise_exception(
             "'messages' parameter must be a JSON-compatible list of chat messages",
-            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     expected_roles = {"user", "assistant", "system"}
     image_found = False
@@ -393,7 +393,7 @@
         ):
             raise_exception(
                 f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
-                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if isinstance(message, AssistantMessage):
             assistant_message_count += 1
@@ -407,7 +407,7 @@
         if message.get("role") not in expected_roles:
             raise_exception(
                 f"Invalid role provided: {message.get('role')}. Message number: {num}",
-                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if message.get("role") == "assistant":
             assistant_message_count += 1
@@ -417,7 +417,7 @@
         if not isinstance(content, (str, list)):
             raise_exception(
                 f"Content in each turn must be a string or array. Message number: {num}",
-                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if isinstance(content, list):
             if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
@@ -425,21 +425,21 @@
     if not image_found:
         raise_exception(
             "Message needs to have multi-modal input like images.",
-            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if assistant_message_count == 0:
         raise_exception(
             "Assistant role required in one of the messages.",
-            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if user_message_count == 0:
         raise_exception(
             "User role required in one of the messages.",
-            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if assistant_message_count > 1:
         raise_exception(
             "Evaluators for multimodal conversations only support single turn. "
             "User and assistant role expected as the only role in each message.",
-            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )

azure/ai/evaluation/_constants.py
@@ -1,7 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import enum
 from typing import Literal
+from azure.ai.evaluation._common._experimental import experimental


 class EvaluationMetrics:
@@ -20,6 +22,9 @@ class EvaluationMetrics:
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
+    ARTWORK = "artwork"
+    FICTIONAL_CHARACTERS = "fictional_characters"
+    LOGOS_AND_BRANDS = "logos_and_brands"
     XPIA = "xpia"


@@ -57,6 +62,22 @@ class EvaluationRunProperties:
     EVALUATION_SDK = "_azureml.evaluation_sdk_name"


+@experimental
+class _AggregationType(enum.Enum):
+    """Defines how numeric evaluation results should be aggregated
+    to produce a single value. Used by individual evaluators to combine per-turn results for
+    a conversation-based input. In general, wherever this enum is used, it is also possible
+    to directly assign the underlying aggregation function for more complex use cases.
+    The 'custom' value is generally not an acceptable input, and should only be used as an output
+    to indicate that a custom aggregation function has been injected."""
+
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+    SUM = "sum"
+    CUSTOM = "custom"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
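
Note: a minimal sketch, not the SDK's implementation (its own helpers live in `_evaluators/_common/_conversation_aggregators.py`, see the file list above), of how an `_AggregationType` value can be resolved to a callable over per-turn scores as the docstring describes. The helper name is invented.

```python
from statistics import mean
from typing import Callable, List, Union

from azure.ai.evaluation._constants import _AggregationType  # private module, per this diff

_AGGREGATIONS = {
    _AggregationType.MEAN: mean,
    _AggregationType.MAX: max,
    _AggregationType.MIN: min,
    _AggregationType.SUM: sum,
}


def resolve_aggregator(
    choice: Union[_AggregationType, Callable[[List[float]], float]]
) -> Callable[[List[float]], float]:
    # A callable may be passed directly for custom behavior; CUSTOM itself is
    # only reported as an output, never accepted as an input.
    if callable(choice):
        return choice
    return _AGGREGATIONS[choice]
```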

azure/ai/evaluation/_evaluate/_batch_run/__init__.py
@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 from .target_run_context import TargetRunContext
+from .proxy_client import ProxyRun

-__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]

azure/ai/evaluation/_evaluate/_eval_run.py
@@ -421,7 +421,9 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             local_paths.append(local_file_path)

         # We will write the artifacts to the workspaceblobstore
-        datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
+        datastore = self._management_client.workspace_get_default_datastore(
+            self._workspace_name, include_credentials=True
+        )
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"

         svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)

azure/ai/evaluation/_evaluate/_evaluate.py
@@ -12,6 +12,7 @@ import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 from promptflow.entities import Run
+from promptflow._sdk._configuration import Configuration

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -20,17 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
+    DataLoaderFactory,
 )

 TClient = TypeVar("TClient", ProxyClient, CodeClient)
@@ -144,6 +147,9 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     """
     handled_metrics = [
         EvaluationMetrics.PROTECTED_MATERIAL,
+        EvaluationMetrics.FICTIONAL_CHARACTERS,
+        EvaluationMetrics.ARTWORK,
+        EvaluationMetrics.LOGOS_AND_BRANDS,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
     ]
@@ -429,10 +435,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         )

     try:
-        initial_data_df = pd.read_json(data, lines=True)
+        data_loader = DataLoaderFactory.get_loader(data)
+        initial_data_df = data_loader.load()
     except Exception as e:
         raise EvaluationException(
-            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
+            message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -444,7 +451,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    pf_client: PFClient,
+    batch_client: TClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -454,10 +461,10 @@ def _apply_target_to_data(

     :param target: The function to be applied to data.
     :type target: Callable
-    :param data: The path to input jsonl file.
+    :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param pf_client: The promptflow client to be used.
-    :type pf_client: PFClient
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -467,7 +474,7 @@ def _apply_target_to_data(
     """
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run: Run = pf_client.run(
+        run: ProxyRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
@@ -475,7 +482,18 @@
             name=_run_name,
         )

-    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+    run_summary = batch_client.get_run_summary(run)
+
+    if run_summary["completed_lines"] == 0:
+        msg = (f"Evaluation target failed to produce any results."
+               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -494,7 +512,7 @@
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)

-    return target_output, generated_columns, run
+    return target_output, generated_columns, run.run.result()


 def _process_column_mappings(
@@ -510,7 +528,7 @@

     processed_config: Dict[str, Dict[str, str]] = {}

-    unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
+    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")

     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
@@ -519,7 +537,7 @@

             for map_to_key, map_value in mapping_config.items():
                 # Check if there's any unexpected reference other than ${target.} or ${data.}
-                if unexpected_references.search(map_value):
+                if not expected_references.search(map_value):
                     msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                     raise EvaluationException(
                         message=msg,
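
Note: a quick illustration of the stricter column-mapping check introduced above; the mapping values are invented examples.

```python
import re

expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")

print(bool(expected_references.search("${data.query}")))            # True  - accepted
print(bool(expected_references.search("${target.response}")))       # True  - accepted
print(bool(expected_references.search("${run.outputs.answer}")))    # False - only target/data references are allowed
print(bool(expected_references.search("some ${data.query} text")))  # False - the whole value must be a single reference
```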
@@ -569,13 +587,14 @@ def evaluate(
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

     :keyword data: Path to the data to be evaluated or passed to target if target is set.
-        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+        JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
         and value as the evaluator function. Required.
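
Note: a hypothetical invocation showing the new `fail_on_evaluator_errors` flag together with CSV input; the file name and evaluator alias are illustrative only.

```python
from azure.ai.evaluation import BleuScoreEvaluator, evaluate

result = evaluate(
    data="answers.csv",  # CSV is now accepted alongside JSONL
    evaluators={"bleu": BleuScoreEvaluator()},
    fail_on_evaluator_errors=True,  # cancel the whole run on the first evaluator failure
)
```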
@@ -594,6 +613,11 @@
     :paramtype output_path: Optional[str]
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+        if ANY evaluator fails during their evaluation.
+        Defaults to false, which means that evaluations will continue regardless of failures.
+        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+    :paramtype fail_on_evaluator_errors: bool
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -615,6 +639,7 @@
             evaluator_config=evaluator_config,
             azure_ai_project=azure_ai_project,
             output_path=output_path,
+            fail_on_evaluator_errors=fail_on_evaluator_errors,
             **kwargs,
         )
     except Exception as e:
@@ -663,6 +688,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     print("\n====================================================\n")


+def _print_fail_flag_warning() -> None:
+    print(
+        "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+        + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+        + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+        + "without producing any outputs, since a single failure will cancel the entire run "
+        "when fail_on_evaluator_errors is enabled."
+    )
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -672,8 +707,11 @@
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
+    if fail_on_evaluator_errors:
+        _print_fail_flag_warning()
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -690,6 +728,7 @@
     if target is not None:
         _validate_columns_for_target(input_data_df, target)

+    Configuration.get_instance().set_config("trace.destination", "none")
     pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None

@@ -702,7 +741,7 @@
     target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
         )

     for evaluator_name, mapping in column_mapping.items():
@@ -773,6 +812,10 @@
     evaluators_result_df = None
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
+        if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+            _print_summary(per_evaluator_results)
+            _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
         evaluator_result_df = evaluator_result["result"]

         # drop input columns
@@ -825,3 +868,20 @@
         _write_output(output_path, result)

     return result
+
+
+def _turn_error_logs_into_exception(log_path: str) -> None:
+    """Produce an EvaluationException using the contents of the inputted
+    file as the error message.
+
+    :param log_path: The path to the error log file.
+    :type log_path: str
+    """
+    with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+        error_message = file.read()
+    raise EvaluationException(
+        message=error_message,
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.FAILED_EXECUTION,
+        blame=ErrorBlame.UNKNOWN,
+    )

azure/ai/evaluation/_evaluate/_utils.py
@@ -328,3 +328,30 @@ def set_event_loop_policy() -> None:
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+
+
+class JSONLDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_json(self.filename, lines=True)
+
+
+class CSVDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_csv(self.filename)
+
+
+class DataLoaderFactory:
+    @staticmethod
+    def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+        filename_str = str(filename).lower()
+        if filename_str.endswith(".csv"):
+            return CSVDataFileLoader(filename)
+
+        # fallback to JSONL to maintain backward compatibility
+        return JSONLDataFileLoader(filename)
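
Note: a short usage sketch of the loader factory added above. These classes live in the private module `azure.ai.evaluation._evaluate._utils`, and the file names are placeholders.

```python
from azure.ai.evaluation._evaluate._utils import DataLoaderFactory

loader = DataLoaderFactory.get_loader("eval_input.csv")    # returns a CSVDataFileLoader
df = loader.load()                                         # pandas DataFrame consumed by evaluate()

loader = DataLoaderFactory.get_loader("eval_input.jsonl")  # anything else falls back to JSONLDataFileLoader
```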

azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -1,30 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize

+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-class _AsyncBleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        # NIST Smoothing
-        smoothing_function = SmoothingFunction().method4
-        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
-
-        return {
-            "bleu_score": score,
-        }

-
-class BleuScoreEvaluator:
+class BleuScoreEvaluator(EvaluatorBase):
     """
     Calculate the BLEU score for a given response and ground truth.

@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self):
-        self._async_evaluator = _AsyncBleuScoreEvaluator()
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)

-    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+        return {
+            "bleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str):
         """
         Evaluate the BLEU score between the response and the ground truth.

@@ -64,9 +73,21 @@ class BleuScoreEvaluator:
         :return: The BLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
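
Note: a minimal usage sketch; the public call shape is unchanged by the refactor onto EvaluatorBase above, and the strings are illustrative.

```python
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result["bleu_score"])  # a float between 0 and 1
```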

azure/ai/evaluation/_evaluators/_common/__init__.py
@@ -5,9 +5,11 @@
 from ._base_eval import EvaluatorBase
 from ._base_prompty_eval import PromptyEvaluatorBase
 from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+from ._base_multi_eval import MultiEvaluatorBase

 __all__ = [
     "EvaluatorBase",
     "PromptyEvaluatorBase",
     "RaiServiceEvaluatorBase",
+    "MultiEvaluatorBase",
 ]