azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. azure/ai/evaluation/_common/_experimental.py +4 -0
  2. azure/ai/evaluation/_common/math.py +62 -2
  3. azure/ai/evaluation/_common/rai_service.py +80 -29
  4. azure/ai/evaluation/_common/utils.py +50 -16
  5. azure/ai/evaluation/_constants.py +1 -0
  6. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  7. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  8. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +11 -0
  9. azure/ai/evaluation/_evaluate/_eval_run.py +34 -10
  10. azure/ai/evaluation/_evaluate/_evaluate.py +59 -103
  11. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  12. azure/ai/evaluation/_evaluate/_utils.py +6 -4
  13. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  15. azure/ai/evaluation/_evaluators/_common/_base_eval.py +17 -5
  16. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +4 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -9
  18. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -50
  19. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  20. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  21. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  22. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  23. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  24. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  25. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  26. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  27. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  28. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  29. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  30. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  31. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  32. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  33. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  34. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  36. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  37. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  38. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  39. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  40. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  41. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  42. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  43. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  44. azure/ai/evaluation/_exceptions.py +2 -0
  45. azure/ai/evaluation/_model_configurations.py +65 -14
  46. azure/ai/evaluation/_version.py +1 -1
  47. azure/ai/evaluation/simulator/_adversarial_scenario.py +15 -1
  48. azure/ai/evaluation/simulator/_adversarial_simulator.py +25 -34
  49. azure/ai/evaluation/simulator/_constants.py +11 -1
  50. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  51. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +11 -1
  52. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  53. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
  54. azure/ai/evaluation/simulator/_simulator.py +51 -45
  55. azure/ai/evaluation/simulator/_utils.py +25 -7
  56. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/METADATA +232 -324
  57. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/RECORD +60 -61
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  59. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +0 -0
  60. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +0 -0
  61. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -3,17 +3,18 @@
 # ---------------------------------------------------------
 import inspect
 import json
+import logging
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
 
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
+from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
 
-from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
@@ -35,6 +36,7 @@ from ._utils import (
 )
 
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
@@ -69,10 +71,11 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
             col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
-            metric_columns[new_col_name] = round(
-                list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
-                2,
-            )
+            try:
+                metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+                LOGGER.warning(msg)
 
     return renamed_cols, metric_columns
 
@@ -119,11 +122,15 @@ def _aggregate_content_safety_metrics(
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-        defect_rates[defect_rate_name] = round(
-            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
-            / col_with_numeric_values.count(),
-            2,
-        )
+        try:
+            col_with_boolean_values = apply_transform_nan_safe(
+                col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+            )
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
+
     return content_safety_cols, defect_rates
 
 
@@ -153,10 +160,11 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-        defect_rates[defect_rate_name] = round(
-            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
-            2,
-        )
+        try:
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
     return label_cols, defect_rates
 
 
@@ -193,6 +201,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
 
+    # NOTE: nan/None values don't count as as booleans, so boolean columns with
+    # nan/None values won't have a mean produced from them.
+    # This is different from label-based known evaluators, which have special handling.
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
@@ -287,7 +298,13 @@ def _validate_columns_for_evaluators(
             # Ignore the missing fields if "conversation" presents in the input data
             missing_inputs = []
         else:
-            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+            optional_params = (
+                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                if hasattr(evaluator, "_OPTIONAL_PARAMS")
+                else []
+            )
+            excluded_params = set(new_df.columns).union(optional_params)
+            missing_inputs = [col for col in evaluator_params if col not in excluded_params]
 
         # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
        # Otherwise, remove it from the missing inputs
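The _OPTIONAL_PARAMS hook checked above is an attribute individual evaluators can declare; a column named there is no longer reported as a missing input even when it is absent from the data. A hypothetical illustration (the evaluator class and data frame below are invented for this example):

    import pandas as pd

    class HypotheticalEvaluator:
        _OPTIONAL_PARAMS = ["query"]  # "query" may be omitted from the input data

        def __call__(self, *, response: str, context: str, query: str = "") -> dict:
            return {"score": 1.0}

    evaluator = HypotheticalEvaluator()
    evaluator_params = ["response", "context", "query"]
    new_df = pd.DataFrame({"response": ["..."], "context": ["..."]})  # no "query" column

    # Mirrors the validation logic added in the hunk above.
    optional_params = getattr(evaluator, "_OPTIONAL_PARAMS", [])
    excluded_params = set(new_df.columns).union(optional_params)
    missing_inputs = [col for col in evaluator_params if col not in excluded_params]
    print(missing_inputs)  # [] -- "query" is optional, so it is not reported as missing
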
@@ -391,7 +408,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
            )
 
        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
-        if not os.path.exists(output_dir):
+        if output_dir and not os.path.exists(output_dir):
            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
            raise EvaluationException(
                message=msg,
@@ -601,48 +618,14 @@ def evaluate(
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult
 
-    :Example:
-
-    Evaluate API can be used as follows:
-
-    .. code-block:: python
-
-        from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
-
-
-        model_config = {
-            "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
-            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
-        }
-
-        coherence_eval = CoherenceEvaluator(model_config=model_config)
-        relevance_eval = RelevanceEvaluator(model_config=model_config)
-
-        path = "evaluate_test_data.jsonl"
-        result = evaluate(
-            data=path,
-            evaluators={
-                "coherence": coherence_eval,
-                "relevance": relevance_eval,
-            },
-            evaluator_config={
-                "coherence": {
-                    "column_mapping": {
-                        "response": "${data.response}",
-                        "query": "${data.query}",
-                    },
-                },
-                "relevance": {
-                    "column_mapping": {
-                        "response": "${data.response}",
-                        "context": "${data.context}",
-                        "query": "${data.query}",
-                    },
-                },
-            },
-        )
+    .. admonition:: Example:
 
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
     """
     try:
         return _evaluate(
@@ -698,7 +681,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     if output_dict:
         print("======= Combined Run Summary (Per Evaluator) =======\n")
         print(json.dumps(output_dict, indent=4))
-        print("\n====================================================")
+        print("\n====================================================\n")
 
 
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
@@ -728,36 +711,7 @@
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
-    # Target Run
-    try:
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-    # pylint: disable=raise-missing-from
-    except MissingAzurePackage:
-        msg = (
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        )
-
-        raise EvaluationException(  # pylint: disable=raise-missing-from
-            message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_PACKAGE,
-            blame=ErrorBlame.USER_ERROR,
-        )
-
-    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-
-    # Handle the case where the customer manually run "pf config set trace.destination=none"
-    if trace_destination and trace_destination.lower() == "none":
-        trace_destination = None
-
+    pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None
 
     # Create default configuration for evaluators that directly maps
@@ -831,11 +785,7 @@
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
-
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
     else:
         data = input_data_df
         per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
@@ -877,20 +827,26 @@
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-    studio_url = _log_metrics_and_instance_results(
-        metrics,
-        result_df,
-        trace_destination,
-        target_run,
-        evaluation_name,
-    )
+
+    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+    target_run = None
+    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+    studio_url = None
+    if trace_destination:
+        studio_url = _log_metrics_and_instance_results(
+            metrics,
+            result_df,
+            trace_destination,
+            target_run,
+            evaluation_name,
+        )
 
     result_df_dict = result_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
 
+    _print_summary(per_evaluator_results)
+
     if output_path:
         _write_output(output_path, result)
 
-    _print_summary(per_evaluator_results)
-
     return result
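With the hunks above, _evaluate builds a plain PFClient and only resolves a trace destination (and Studio URL) when azure_ai_project is supplied. A hedged usage sketch, reusing the model_config shape from the docstring example removed earlier; the data path is a placeholder and the azure_ai_project keys are assumed from the package's AzureAIProject typed dictionary, not shown in this diff:

    import os
    from azure.ai.evaluation import evaluate, CoherenceEvaluator

    model_config = {
        "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.environ.get("AZURE_OPENAI_KEY"),
        "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    }

    result = evaluate(
        data="evaluate_test_data.jsonl",  # placeholder path
        evaluators={"coherence": CoherenceEvaluator(model_config=model_config)},
        # Omit azure_ai_project to keep results local; result["studio_url"] is then None.
        azure_ai_project={
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<ai-project-name>",
        },
    )
    print(result["metrics"], result["studio_url"])
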
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -123,7 +123,8 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, Ev
             user_agent=USER_AGENT,
         )
 
-        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
+        trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
         custom_dimensions: Dict[str, Union[str, bool]] = {
azure/ai/evaluation/_evaluate/_utils.py

@@ -21,7 +21,6 @@ from azure.ai.evaluation._constants import (
     EvaluationRunProperties,
     Prefixes,
 )
-from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject
 
@@ -118,6 +117,8 @@ def _log_metrics_and_instance_results(
     run: Run,
     evaluation_name: Optional[str],
 ) -> Optional[str]:
+    from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
     if trace_destination is None:
         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None
@@ -137,7 +138,7 @@
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
+        artifact_name = EvalRun.EVALUATION_ARTIFACT
 
         with tempfile.TemporaryDirectory() as tmpdir:
             # storing multi_modal images if exists
@@ -164,9 +165,8 @@
         ev_run.write_properties_to_run_history(
             properties={
                 EvaluationRunProperties.RUN_TYPE: "eval_run",
-                EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent",
+                EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
                 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
-                "isEvaluatorRun": "true",
             }
         )
 
@@ -211,6 +211,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
+    print(f'Evaluation results saved to "{p.resolve()}".\n')
+
 
 def _apply_column_mapping(
     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -26,31 +26,30 @@ class _AsyncBleuScoreEvaluator:
 
 class BleuScoreEvaluator:
     """
-    Evaluator that computes the BLEU Score between two strings.
+    Calculate the BLEU score for a given response and ground truth.
 
     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-    translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
-    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
-    better quality.
+    translation. It is widely used in text summarization and text generation use cases.
 
-    **Usage**
+    Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
+    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
+    indicator of quality.
 
-    .. code-block:: python
+    The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
 
-        eval_fn = BleuScoreEvaluator()
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "bleu_score": 0.22
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator.
     """
 
+    id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
 
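The inline usage example removed from the docstring above now lives in the referenced samples file. For readers of this diff, the removed snippet amounted to the following; the expected output shape comes from the old **Output format** block, and the top-level import is assumed from the package's public namespace:

    from azure.ai.evaluation import BleuScoreEvaluator

    eval_fn = BleuScoreEvaluator()
    result = eval_fn(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result)  # e.g. {"bleu_score": 0.22}
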
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -2,70 +2,101 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
-from typing import Optional
+from typing import Dict, Union, List
 
-from typing_extensions import override
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
 
-class CoherenceEvaluator(PromptyEvaluatorBase):
+class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
-    Initialize a coherence evaluator configured for a specific Azure OpenAI model.
+    Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
+
+    The coherence measure assesses the ability of the language model to generate text that reads naturally,
+    flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
+    and user-friendliness of a model's generated responses in real-world applications.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-    **Usage**
-
-    .. code-block:: python
-
-        eval_fn = CoherenceEvaluator(model_config)
-        result = eval_fn(
-            query="What is the capital of Japan?",
-            response="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-    **Output format**
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a CoherenceEvaluator with a query and response.
 
-    .. code-block:: python
+    .. note::
 
-        {
-            "coherence": 1.0,
-            "gpt_coherence": 1.0,
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """
 
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
+    id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
 
-    @override
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate coherence for given input of query, response
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The coherence score.
+        :rtype: Dict[str, float]
+        """
+
+    @overload
     def __call__(
         self,
         *,
-        query: Optional[str] = None,
-        response: Optional[str] = None,
-        conversation=None,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate coherence for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The coherence score.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
        **kwargs,
    ):
        """Evaluate coherence. Accepts either a query and response for a single evaluation,
        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
        turns, the evaluator will aggregate the results of each turn.
 
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
        :keyword response: The response to be evaluated.
        :paramtype response: Optional[str]
-        :keyword context: The context to be evaluated.
-        :paramtype context: Optional[str]
        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
            key "messages". Conversation turns are expected
            to be dictionaries with keys "content" and "role".
@@ -73,4 +104,4 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
         :return: The relevance score.
         :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
         """
-        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+        return super().__call__(*args, **kwargs)
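The two overloads added above correspond to two call shapes: a single query/response pair, or a whole conversation whose per-turn results are aggregated. A hedged sketch of both, with the query/response values taken from the removed docstring example, a conversation shaped as the new overload's docstring describes, and model_config as in the earlier evaluate() sketch:

    coherence = CoherenceEvaluator(model_config=model_config)

    # Single-turn overload: query + response.
    single_turn = coherence(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
    )
    print(single_turn)  # e.g. {"coherence": ..., "gpt_coherence": ...}

    # Conversation overload: turns under "messages", each with "content" and "role".
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of Japan?"},
            {"role": "assistant", "content": "The capital of Japan is Tokyo."},
            {"role": "user", "content": "Is it a big city?"},
            {"role": "assistant", "content": "Yes, Tokyo is one of the largest cities in the world."},
        ]
    }
    multi_turn = coherence(conversation=conversation)  # per-turn scores are aggregated
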
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -7,7 +7,7 @@ from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from typing_extensions import ParamSpec, TypeAlias
+from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -88,7 +88,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
     # super().__call__(<inputs>)
-    def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
        """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
        one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
        The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -127,11 +131,19 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :rtype: List[str]
         """
 
+        overloads = get_overloads(self.__call__)
+        if not overloads:
+            call_signatures = [inspect.signature(self.__call__)]
+        else:
+            call_signatures = [inspect.signature(overload) for overload in overloads]
         call_signature = inspect.signature(self.__call__)
         singletons = []
-        for param in call_signature.parameters:
-            if param not in self._not_singleton_inputs:
-                singletons.append(param)
+        for call_signature in call_signatures:
+            params = call_signature.parameters
+            if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
+                continue
+            # exclude self since it is not a singleton input
+            singletons.extend([p for p in params if p != "self"])
         return singletons
 
     def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
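The singleton-input derivation above now inspects the registered overloads rather than the catch-all *args/**kwargs implementation. A small standalone sketch (the Example class is invented) of how typing_extensions.get_overloads exposes those signatures:

    import inspect
    from typing_extensions import overload, get_overloads

    class Example:
        @overload
        def __call__(self, *, query: str, response: str) -> dict: ...
        @overload
        def __call__(self, *, conversation: dict) -> dict: ...
        def __call__(self, *args, **kwargs):
            return {}

    # get_overloads returns the functions registered by the @overload decorator.
    signatures = [inspect.signature(fn) for fn in get_overloads(Example.__call__)]
    print([list(sig.parameters) for sig in signatures])
    # [['self', 'query', 'response'], ['self', 'conversation']]
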
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -4,7 +4,7 @@
 
 import math
 import re
-from typing import Dict, Union
+from typing import Dict, TypeVar, Union
 
 from promptflow.core import AsyncPrompty
 from typing_extensions import override
@@ -18,8 +18,10 @@ try:
 except ImportError:
     USER_AGENT = "None"
 
+T = TypeVar("T")
 
-class PromptyEvaluatorBase(EvaluatorBase[float]):
+
+class PromptyEvaluatorBase(EvaluatorBase[T]):
     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
     linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict, Optional, Union
+from typing import Dict, TypeVar, Union
 
 from typing_extensions import override
 
@@ -18,7 +18,7 @@ from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
 
-T = Union[str, float]
+T = TypeVar("T")
 
 
 class RaiServiceEvaluatorBase(EvaluatorBase[T]):
@@ -50,12 +50,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         self._credential = credential
 
     @override
-    def __call__(
+    def __call__(  # pylint: disable=docstring-missing-param
         self,
-        *,
-        query: Optional[str] = None,
-        response: Optional[str] = None,
-        conversation=None,
+        *args,
        **kwargs,
    ):
        """Evaluate either a query and response or a conversation. Must supply either a query AND response,
@@ -71,7 +68,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
         """
-        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+        return super().__call__(*args, **kwargs)
 
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
@@ -108,7 +105,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
                 )
             input_data["context"] = context
 
-        return await evaluate_with_rai_service(
+        return await evaluate_with_rai_service(  # type: ignore
             metric_name=self._eval_metric,
             data=input_data,
             project_scope=self._azure_ai_project,