azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to its public registry, and is provided for informational purposes only.


Files changed (31)
  1. azure/ai/evaluation/_azure/_clients.py +24 -8
  2. azure/ai/evaluation/_azure/_models.py +2 -2
  3. azure/ai/evaluation/_constants.py +18 -0
  4. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
  5. azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
  6. azure/ai/evaluation/_evaluate/_evaluate.py +69 -12
  7. azure/ai/evaluation/_evaluate/_utils.py +27 -0
  8. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
  9. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  10. azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
  11. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
  12. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
  13. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  14. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
  15. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
  16. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
  17. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
  18. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
  19. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
  20. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
  21. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
  22. azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
  23. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
  24. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
  25. azure/ai/evaluation/_version.py +1 -1
  26. azure/ai/evaluation/simulator/_simulator.py +21 -13
  27. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/METADATA +71 -7
  28. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/RECORD +31 -29
  29. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/NOTICE.txt +0 -0
  30. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/WHEEL +0 -0
  31. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_azure/_clients.py
@@ -17,7 +17,7 @@ from azure.ai.evaluation.simulator._model_tools._identity_manager import TokenSc
 from ._models import BlobStoreInfo, Workspace


-API_VERSION: Final[str] = "2024-10-01"
+API_VERSION: Final[str] = "2024-07-01-preview"
 QUERY_KEY_API_VERSION: Final[str] = "api-version"
 PATH_ML_WORKSPACES = ("providers", "Microsoft.MachineLearningServices", "workspaces")

@@ -69,7 +69,9 @@ class LiteMLClient:
         self._get_token_manager()
         return cast(TokenCredential, self._credential)

-    def workspace_get_default_datastore(self, workspace_name: str, include_credentials: bool = False) -> BlobStoreInfo:
+    def workspace_get_default_datastore(
+        self, workspace_name: str, *, include_credentials: bool = False, **kwargs: Any
+    ) -> BlobStoreInfo:
         # 1. Get the default blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list?view=rest-azureml-2024-10-01
@@ -92,18 +94,29 @@ class LiteMLClient:
         account_name = props_json["accountName"]
         endpoint = props_json["endpoint"]
         container_name = props_json["containerName"]
+        credential_type = props_json.get("credentials", {}).get("credentialsType")

         # 2. Get the SAS token to use for accessing the blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list-secrets?view=rest-azureml-2024-10-01
-        blob_store_credential: Optional[Union[AzureSasCredential, str]] = None
-        if include_credentials:
+        blob_store_credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
+        if not include_credentials:
+            blob_store_credential = None
+        elif credential_type and credential_type.lower() == "none":
+            # If storage account key access is disabled, and only Microsoft Entra ID authentication is available,
+            # the credentialsType will be "None" and we should not attempt to get the secrets.
+            blob_store_credential = self.get_credential()
+        else:
             url = self._generate_path(
                 *PATH_ML_WORKSPACES, workspace_name, "datastores", "workspaceblobstore", "listSecrets"
             )
             secrets_response = self._http_client.request(
                 method="POST",
                 url=url,
+                json={
+                    "expirableSecret": True,
+                    "expireAfterHours": int(kwargs.get("key_expiration_hours", 1)),
+                },
                 params={
                     QUERY_KEY_API_VERSION: self._api_version,
                 },
@@ -114,10 +127,13 @@ class LiteMLClient:
         secrets_json = secrets_response.json()
         secrets_type = secrets_json["secretsType"].lower()

+        # As per this website, only SAS tokens, access tokens, or Entra IDs are valid for accessing blob data
+        # stores:
+        # https://learn.microsoft.com/rest/api/storageservices/authorize-requests-to-azure-storage.
         if secrets_type == "sas":
             blob_store_credential = AzureSasCredential(secrets_json["sasToken"])
         elif secrets_type == "accountkey":
-            # To support olders versions of azure-storage-blob better, we return a string here instead of
+            # To support older versions of azure-storage-blob better, we return a string here instead of
             # an AzureNamedKeyCredential
             blob_store_credential = secrets_json["key"]
         else:
@@ -164,19 +180,19 @@ class LiteMLClient:
             # nothing to see here, move along
             return

-        additional_info: Optional[str] = None
+        message = f"The {description} request failed with HTTP {response.status_code}"
         try:
             error_json = response.json()["error"]
             additional_info = f"({error_json['code']}) {error_json['message']}"
+            message += f" - {additional_info}"
         except (JSONDecodeError, ValueError, KeyError):
             pass

         raise EvaluationException(
-            message=f"The {description} request failed with HTTP {response.status_code}",
+            message=message,
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.SYSTEM_ERROR,
-            internal_message=additional_info,
         )

     def _generate_path(self, *paths: str) -> str:
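
The datastore helper now takes include_credentials as a keyword-only argument and forwards key_expiration_hours through **kwargs into the listSecrets request body. A minimal sketch of the new call shape, assuming an already-constructed LiteMLClient instance (LiteMLClient is an internal helper, so its construction is omitted here; the wrapper function name is illustrative):

from typing import Optional, Union

from azure.core.credentials import AzureSasCredential, TokenCredential


def fetch_default_store_credential(client, workspace_name: str):
    """Sketch only: `client` is assumed to be an existing LiteMLClient instance."""
    store = client.workspace_get_default_datastore(
        workspace_name,
        include_credentials=True,   # keyword-only as of this release
        key_expiration_hours=2,     # optional; forwarded via **kwargs into the listSecrets body
    )
    # The credential may now also be a TokenCredential when the storage account only permits Entra ID auth.
    credential: Optional[Union[AzureSasCredential, TokenCredential, str]] = store.credential
    return credential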

azure/ai/evaluation/_azure/_models.py
@@ -8,7 +8,7 @@

 from typing import Dict, List, NamedTuple, Optional, Union
 from msrest.serialization import Model
-from azure.core.credentials import AzureSasCredential
+from azure.core.credentials import AzureSasCredential, TokenCredential


 class BlobStoreInfo(NamedTuple):
@@ -16,7 +16,7 @@ class BlobStoreInfo(NamedTuple):
     account_name: str
     endpoint: str
     container_name: str
-    credential: Optional[Union[AzureSasCredential, str]]
+    credential: Optional[Union[AzureSasCredential, TokenCredential, str]]


 class WorkspaceHubConfig(Model):

azure/ai/evaluation/_constants.py
@@ -1,7 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import enum
 from typing import Literal
+from azure.ai.evaluation._common._experimental import experimental


 class EvaluationMetrics:
@@ -57,6 +59,22 @@ class EvaluationRunProperties:
     EVALUATION_SDK = "_azureml.evaluation_sdk_name"


+@experimental
+class _AggregationType(enum.Enum):
+    """Defines how numeric evaluation results should be aggregated
+    to produce a single value. Used by individual evaluators to combine per-turn results for
+    a conversation-based input. In general, wherever this enum is used, it is also possible
+    to directly assign the underlying aggregation function for more complex use cases.
+    The 'custom' value is generally not an acceptable input, and should only be used as an output
+    to indicate that a custom aggregation function has been injected."""
+
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+    SUM = "sum"
+    CUSTOM = "custom"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
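
The new enum is consumed by the also-new _conversation_aggregators helper (listed in the files changed above but not shown in this section) through the GetAggregator/GetAggregatorType functions imported in _base_eval.py further down. A hedged sketch of the mapping it implies, assuming GetAggregator simply resolves each member to a built-in reducer; the real module may differ, and _ASSUMED_AGGREGATORS/get_aggregator_sketch are illustrative names:

from statistics import mean
from typing import Callable, Dict, List

from azure.ai.evaluation._constants import _AggregationType

# Assumed mapping; CUSTOM gets no stock function because it only labels an injected aggregator.
_ASSUMED_AGGREGATORS: Dict[_AggregationType, Callable[[List[float]], float]] = {
    _AggregationType.MEAN: mean,
    _AggregationType.MAX: max,
    _AggregationType.MIN: min,
    _AggregationType.SUM: sum,
}


def get_aggregator_sketch(kind: _AggregationType) -> Callable[[List[float]], float]:
    """Hypothetical stand-in for GetAggregator from the new _conversation_aggregators module."""
    return _ASSUMED_AGGREGATORS[kind]


print(get_aggregator_sketch(_AggregationType.MAX)([0.2, 0.9, 0.5]))  # prints 0.9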

azure/ai/evaluation/_evaluate/_batch_run/__init__.py
@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 from .target_run_context import TargetRunContext
+from .proxy_client import ProxyRun

-__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]

azure/ai/evaluation/_evaluate/_eval_run.py
@@ -421,7 +421,9 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             local_paths.append(local_file_path)

         # We will write the artifacts to the workspaceblobstore
-        datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
+        datastore = self._management_client.workspace_get_default_datastore(
+            self._workspace_name, include_credentials=True
+        )
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"

         svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)

azure/ai/evaluation/_evaluate/_evaluate.py
@@ -12,6 +12,7 @@ import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 from promptflow.entities import Run
+from promptflow._sdk._configuration import Configuration

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -20,17 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
+    DataLoaderFactory,
 )

 TClient = TypeVar("TClient", ProxyClient, CodeClient)
@@ -429,10 +432,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         )

     try:
-        initial_data_df = pd.read_json(data, lines=True)
+        data_loader = DataLoaderFactory.get_loader(data)
+        initial_data_df = data_loader.load()
     except Exception as e:
         raise EvaluationException(
-            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
+            message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -444,7 +448,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    pf_client: PFClient,
+    batch_client: TClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -454,10 +458,10 @@ def _apply_target_to_data(

     :param target: The function to be applied to data.
     :type target: Callable
-    :param data: The path to input jsonl file.
+    :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param pf_client: The promptflow client to be used.
-    :type pf_client: PFClient
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -467,7 +471,7 @@ def _apply_target_to_data(
     """
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run: Run = pf_client.run(
+        run: ProxyRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
@@ -475,7 +479,18 @@ def _apply_target_to_data(
             name=_run_name,
         )

-    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+    run_summary = batch_client.get_run_summary(run)
+
+    if run_summary["completed_lines"] == 0:
+        msg = (f"Evaluation target failed to produce any results."
+               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -494,7 +509,7 @@ def _apply_target_to_data(
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)

-    return target_output, generated_columns, run
+    return target_output, generated_columns, run.run.result()


 def _process_column_mappings(
@@ -569,13 +584,14 @@ def evaluate(
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

     :keyword data: Path to the data to be evaluated or passed to target if target is set.
-        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+        JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
         and value as the evaluator function. Required.
@@ -594,6 +610,11 @@ def evaluate(
     :paramtype output_path: Optional[str]
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+        if ANY evaluator fails during their evaluation.
+        Defaults to false, which means that evaluations will continue regardless of failures.
+        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+    :paramtype fail_on_evaluator_errors: bool
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -615,6 +636,7 @@ def evaluate(
             evaluator_config=evaluator_config,
             azure_ai_project=azure_ai_project,
             output_path=output_path,
+            fail_on_evaluator_errors=fail_on_evaluator_errors,
             **kwargs,
         )
     except Exception as e:
@@ -663,6 +685,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     print("\n====================================================\n")


+def _print_fail_flag_warning() -> None:
+    print(
+        "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+        + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+        + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+        + "without producing any outputs, since a single failure will cancel the entire run "
+        "when fail_on_evaluator_errors is enabled."
+    )
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -672,8 +704,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
+    if fail_on_evaluator_errors:
+        _print_fail_flag_warning()
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -690,6 +725,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)

+    Configuration.get_instance().set_config("trace.destination", "none")
     pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None

@@ -702,7 +738,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():
@@ -773,6 +809,10 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluators_result_df = None
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
+        if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+            _print_summary(per_evaluator_results)
+            _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
         evaluator_result_df = evaluator_result["result"]

         # drop input columns
@@ -825,3 +865,20 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         _write_output(output_path, result)

     return result
+
+
+def _turn_error_logs_into_exception(log_path: str) -> None:
+    """Produce an EvaluationException using the contents of the inputted
+    file as the error message.
+
+    :param log_path: The path to the error log file.
+    :type log_path: str
+    """
+    with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+        error_message = file.read()
+    raise EvaluationException(
+        message=error_message,
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.FAILED_EXECUTION,
+        blame=ErrorBlame.UNKNOWN,
+    )
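
Taken together, the _evaluate.py changes add two user-visible behaviors: CSV input files are accepted alongside JSONL, and the new fail_on_evaluator_errors flag aborts the run if any evaluator reports failed lines. A minimal usage sketch against the public evaluate() API (file names and the evaluator alias are illustrative; the data file is assumed to contain the response and ground_truth columns the chosen evaluator needs):

from azure.ai.evaluation import BleuScoreEvaluator, evaluate

result = evaluate(
    data="eval_data.csv",  # CSV is now accepted in addition to JSONL
    evaluators={"bleu": BleuScoreEvaluator()},
    fail_on_evaluator_errors=True,  # new flag: raise EvaluationException if any evaluator reports failed lines
    output_path="evaluation_results.json",
)
print(result["metrics"])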

azure/ai/evaluation/_evaluate/_utils.py
@@ -328,3 +328,30 @@ def set_event_loop_policy() -> None:
     # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
     # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+
+
+class JSONLDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_json(self.filename, lines=True)
+
+
+class CSVDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_csv(self.filename)
+
+
+class DataLoaderFactory:
+    @staticmethod
+    def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+        filename_str = str(filename).lower()
+        if filename_str.endswith(".csv"):
+            return CSVDataFileLoader(filename)
+
+        # fallback to JSONL to maintain backward compatibility
+        return JSONLDataFileLoader(filename)
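
Loader selection is purely extension-based and falls back to JSONL for anything that is not a .csv file. For example (DataLoaderFactory is a private helper and the file names are illustrative):

from azure.ai.evaluation._evaluate._utils import DataLoaderFactory

csv_loader = DataLoaderFactory.get_loader("eval_data.csv")      # returns a CSVDataFileLoader
jsonl_loader = DataLoaderFactory.get_loader("eval_data.jsonl")  # returns a JSONLDataFileLoader (fallback)
df = csv_loader.load()  # loads the file into a pandas DataFrame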

azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -1,30 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize

+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-class _AsyncBleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        # NIST Smoothing
-        smoothing_function = SmoothingFunction().method4
-        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
-
-        return {
-            "bleu_score": score,
-        }

-
-class BleuScoreEvaluator:
+class BleuScoreEvaluator(EvaluatorBase):
     """
     Calculate the BLEU score for a given response and ground truth.

@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self):
-        self._async_evaluator = _AsyncBleuScoreEvaluator()
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)

-    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+        return {
+            "bleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str):
         """
         Evaluate the BLEU score between the response and the ground truth.

@@ -64,9 +73,21 @@ class BleuScoreEvaluator:
         :return: The BLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
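
The public calling convention is unchanged by the move onto EvaluatorBase: a keyword-only call with response and ground_truth still returns a dict with a bleu_score key. For example (the input strings and the printed value are illustrative):

from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(response="Tokyo is the capital of Japan.", ground_truth="The capital of Japan is Tokyo.")
print(result)  # e.g. {'bleu_score': 0.21...}; the exact value depends on tokenization and smoothing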

azure/ai/evaluation/_evaluators/_common/__init__.py
@@ -5,9 +5,11 @@
 from ._base_eval import EvaluatorBase
 from ._base_prompty_eval import PromptyEvaluatorBase
 from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+from ._base_multi_eval import MultiEvaluatorBase

 __all__ = [
     "EvaluatorBase",
     "PromptyEvaluatorBase",
     "RaiServiceEvaluatorBase",
+    "MultiEvaluatorBase",
 ]

azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -4,15 +4,18 @@

 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional

 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads

-from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import _AggregationType
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType

 P = ParamSpec("P")
 T = TypeVar("T")
@@ -25,6 +28,7 @@ class DerivedEvalInput(TypedDict, total=False):
     query: Dict[str, Any]
     response: Dict[str, Any]
     context: str
+    ground_truth: str


 AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
@@ -69,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
     """

     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -80,11 +91,17 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         *,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]

     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
@@ -158,6 +175,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         include_context = "context" in self._singleton_inputs
         include_query = "query" in self._singleton_inputs
         include_response = "response" in self._singleton_inputs
+        include_ground_truth = "ground_truth" in self._singleton_inputs

         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -198,6 +216,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 eval_input["response"] = response.get("content", "")
             if include_context:
                 eval_input["context"] = str(context)
+            if include_ground_truth:
+                eval_input["ground_truth"] = response.get("ground_truth", "")
             eval_inputs.append(eval_input)
         return eval_inputs

@@ -355,7 +375,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
@@ -383,10 +403,51 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)

+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~``
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator

+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
+        evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> _AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is inputted into an evaluator that evaluates each turn individually). The individual inputs
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation._AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+

 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
@@ -402,7 +463,9 @@ class AsyncEvaluatorBase:
     # are just not passed into this function instead of ending up in kwargs.
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+    async def __call__(
+        self, *, query=None, response=None, context=None, conversation=None, ground_truth=None, **kwargs
+    ):
         if conversation is not None:
             kwargs["conversation"] = conversation
         if query is not None:
@@ -411,4 +474,6 @@
             kwargs["response"] = response
         if context is not None:
             kwargs["context"] = context
+        if ground_truth is not None:
+            kwargs["ground_truth"] = ground_truth
         return await self._real_call(**kwargs)
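
For evaluator authors, the practical effect of these EvaluatorBase changes is that a subclass can choose how per-turn scores are rolled up for conversation inputs. A hedged sketch of a toy subclass, assuming only what this diff shows (the _do_eval override pattern from _bleu.py and the new conversation_aggregation_type keyword); WorstTurnLengthEvaluator and its metric name are invented for illustration:

from typing import Dict

from typing_extensions import overload, override

from azure.ai.evaluation._constants import _AggregationType
from azure.ai.evaluation._evaluators._common import EvaluatorBase


class WorstTurnLengthEvaluator(EvaluatorBase):
    """Toy sketch: score each turn by response length; a conversation gets the minimum across turns."""

    def __init__(self):
        # MIN replaces the previous hard-coded mean when aggregating per-turn results.
        super().__init__(conversation_aggregation_type=_AggregationType.MIN)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
        return {"length": float(len(eval_input["response"]))}

    @overload  # the base class derives the accepted singleton inputs from __call__ overloads
    def __call__(self, *, response: str): ...

    @override
    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)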