azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.
Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -41,9 +41,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
 
  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :param threshold: The threshold for the Sexual evaluator. Default is 3.
  :type threshold: int
 
@@ -55,17 +55,17 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  :language: python
  :dedent: 8
  :caption: Initialize and call a SexualEvaluator.
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START sexual_evaluator]
  :end-before: [END sexual_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
  .. admonition:: Example with Threshold:
 
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -76,8 +76,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  :caption: Initialize with threshold and call a SexualEvaluator.
  """
 
- id = "azureml://registries/azureml/models/Sexual-Content-Evaluator/versions/3"
+ id = "azureai://built-in/evaluators/sexual"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+ _OPTIONAL_PARAMS = ["query"]
 
  @override
  def __init__(
@@ -86,6 +87,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  azure_ai_project,
  *,
  threshold: int = 3,
+ **kwargs,
  ):
  super().__init__(
  eval_metric=EvaluationMetrics.SEXUAL,
@@ -94,6 +96,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  conversation_aggregation_type=_AggregationType.MAX,
  threshold=threshold,
  _higher_is_better=False,
+ **kwargs,
  )
 
  @overload
@@ -146,7 +149,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  key "messages". Conversation turns are expected
  to be dictionaries with keys "content" and "role".
  :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
- :return: The fluency score.
+ :return: The sexual score.
  :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
  """
  return super().__call__(*args, **kwargs)
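The SexualEvaluator hunks above widen azure_ai_project to accept either a project endpoint URL or an AzureAIProject dict, add _OPTIONAL_PARAMS = ["query"], and pass **kwargs through to the base class. A minimal usage sketch of the updated constructor, not part of the diff; the endpoint and inputs below are placeholders following the URL format shown in the docstring:

    # Hypothetical sketch: constructing SexualEvaluator against a project endpoint string.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import SexualEvaluator

    # 1.10.0 accepts a project endpoint string as well as an AzureAIProject dict.
    project_endpoint = "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>"

    sexual_eval = SexualEvaluator(DefaultAzureCredential(), project_endpoint, threshold=3)
    result = sexual_eval(query="What is the capital of France?", response="Paris.")
    print(result)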
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -41,9 +41,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
 
  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :param threshold: The threshold for the Violence evaluator. Default is 3.
  :type threshold: int
 
@@ -57,15 +57,15 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  :caption: Initialize and call a ViolenceEvaluator.
 
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START violence_evaluator]
  :end-before: [END violence_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
  .. admonition:: Example:
 
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -76,8 +76,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  :caption: Initialize with threshold and call a ViolenceEvaluator.
  """
 
- id = "azureml://registries/azureml/models/Violent-Content-Evaluator/versions/3"
+ id = "azureai://built-in/evaluators/violence"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+ _OPTIONAL_PARAMS = ["query"]
 
  @override
  def __init__(
@@ -86,6 +87,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  azure_ai_project,
  *,
  threshold: int = 3,
+ **kwargs,
  ):
  super().__init__(
  eval_metric=EvaluationMetrics.VIOLENCE,
@@ -94,6 +96,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  conversation_aggregation_type=_AggregationType.MAX,
  threshold=threshold,
  _higher_is_better=False,
+ **kwargs,
  )
 
  @overload
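ViolenceEvaluator picks up the same changes. Because the docstring context above describes conversation input (a dict with a "messages" key whose turns carry "content" and "role") and the constructor passes conversation_aggregation_type=_AggregationType.MAX, here is a hypothetical sketch of conversation-mode evaluation; the endpoint and messages are placeholders:

    # Hypothetical sketch: per-turn violence evaluation over a conversation,
    # aggregated with the MAX strategy configured in the hunk above.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ViolenceEvaluator

    violence_eval = ViolenceEvaluator(
        DefaultAzureCredential(),
        "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",  # placeholder endpoint
    )

    conversation = {
        "messages": [
            {"role": "user", "content": "Describe medieval siege tactics."},
            {"role": "assistant", "content": "Armies used trebuchets and battering rams to breach walls."},
        ]
    }

    result = violence_eval(conversation=conversation)  # per-turn scores plus the MAX aggregate
    print(result)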
azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py

@@ -4,8 +4,4 @@
 
  from ._document_retrieval import DocumentRetrievalEvaluator, RetrievalGroundTruthDocument, RetrievedDocument
 
- __all__ = [
- "DocumentRetrievalEvaluator",
- "RetrievalGroundTruthDocument",
- "RetrievedDocument"
- ]
+ __all__ = ["DocumentRetrievalEvaluator", "RetrievalGroundTruthDocument", "RetrievedDocument"]
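The _document_retrieval.py hunks below are mostly black-style reformatting of DocumentRetrievalEvaluator plus a new cloud evaluator id. As a reference for the shapes involved, a hypothetical usage sketch built from the RetrievalGroundTruthDocument and RetrievedDocument TypedDicts and the metric keys visible in those hunks; the retrieved_documents keyword and the ground_truth_label_min/max constructor names are assumptions, since only retrieval_ground_truth is named explicitly in the hunks:

    # Hypothetical sketch: scoring a ranked result list against labeled ground truth.
    from azure.ai.evaluation import DocumentRetrievalEvaluator

    doc_eval = DocumentRetrievalEvaluator(
        ground_truth_label_min=0,  # assumed keyword; validated in the hunks below
        ground_truth_label_max=4,  # assumed keyword; validated in the hunks below
    )

    retrieval_ground_truth = [
        {"document_id": "doc1", "query_relevance_label": 4},
        {"document_id": "doc2", "query_relevance_label": 2},
        {"document_id": "doc3", "query_relevance_label": 0},
    ]
    retrieved_documents = [
        {"document_id": "doc2", "relevance_score": 45.1},
        {"document_id": "doc1", "relevance_score": 35.8},
        {"document_id": "doc9", "relevance_score": 29.2},  # unlabeled, so it counts toward "holes"
    ]

    result = doc_eval(
        retrieval_ground_truth=retrieval_ground_truth,
        retrieved_documents=retrieved_documents,  # assumed keyword
    )
    print(result["fidelity"], result["holes"])  # metric keys shown in the hunks below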
azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

@@ -14,9 +14,7 @@ RetrievalGroundTruthDocument = TypedDict(
  "RetrievalGroundTruthDocument", {"document_id": str, "query_relevance_label": int}
  )
 
- RetrievedDocument = TypedDict(
- "RetrievedDocument", {"document_id": str, "relevance_score": float}
- )
+ RetrievedDocument = TypedDict("RetrievedDocument", {"document_id": str, "relevance_score": float})
 
 
  class DocumentRetrievalEvaluator(EvaluatorBase):
@@ -33,15 +31,15 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  :caption: Initialize and call a DocumentRetrievalEvaluator
 
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START document_retrieval_evaluator]
  :end-before: [END document_retrieval_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
  .. admonition:: Example with Threshold:
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
  :start-after: [START threshold_document_retrieval_evaluator]
@@ -51,6 +49,9 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  :caption: Initialize with threshold and call a DocumentRetrievalEvaluator.
  """
 
+ id = "azureai://built-in/evaluators/document_retrieval"
+ """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
  def __init__(
  self,
  *,
@@ -62,7 +63,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  top1_relevance_threshold: Optional[float] = 50.0,
  top3_max_relevance_threshold: Optional[float] = 50.0,
  total_retrieved_documents_threshold: Optional[int] = 50,
- total_ground_truth_documents_threshold: Optional[int] = 50
+ total_ground_truth_documents_threshold: Optional[int] = 50,
  ):
  super().__init__()
  self.k = 3
@@ -74,14 +75,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )
 
  if not isinstance(ground_truth_label_min, int):
- raise EvaluationException(
- "The ground truth label minimum must be an integer value."
- )
+ raise EvaluationException("The ground truth label minimum must be an integer value.")
 
  if not isinstance(ground_truth_label_max, int):
- raise EvaluationException(
- "The ground truth label maximum must be an integer value."
- )
+ raise EvaluationException("The ground truth label maximum must be an integer value.")
 
  self.ground_truth_label_min = ground_truth_label_min
  self.ground_truth_label_max = ground_truth_label_max
@@ -122,7 +119,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  ) -> float:
  """NDCG (Normalized Discounted Cumulative Gain) calculated for the top K documents retrieved from a search query.
  NDCG measures how well a document ranking compares to an ideal document ranking given a list of ground-truth documents.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
@@ -145,7 +142,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  def _compute_xdcg(self, result_docs_groundtruth_labels: List[int]) -> float:
  """XDCG calculated for the top K documents retrieved from a search query.
  XDCG measures how objectively good are the top K documents, discounted by their position in the list.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :return: The XDCG@K calculation result.
@@ -159,11 +156,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  return math.pow(self.xdcg_discount_factor, rank - 1)
 
  ranks = list(range(1, self.k + 1))
- xdcg_n = sum(
- starmap(
- calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)
- )
- )
+ xdcg_n = sum(starmap(calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)))
  xdcg_d = sum(map(calculate_xdcg_denominator, ranks))
 
  return xdcg_n / float(xdcg_d)
@@ -175,7 +168,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  ) -> float:
  """Fidelity calculated over all documents retrieved from a search query.
  Fidelity measures how objectively good are all of the documents retrieved compared with all known good documents in the underlying data store.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
@@ -196,25 +189,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  if label >= s:
  label_counts[str(label)] += 1
 
- sorted_label_counts = [
- x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])
- ]
+ sorted_label_counts = [x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])]
 
  # calculate weights
- weights = [
- (math.pow(2, i + 1) - 1)
- for i in range(s, self.ground_truth_label_max + 1)
- ]
+ weights = [(math.pow(2, i + 1) - 1) for i in range(s, self.ground_truth_label_max + 1)]
 
  # return weighted sum
  return sum(starmap(operator.mul, zip(sorted_label_counts, weights)))
 
- weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(
- result_docs_groundtruth_labels
- )
- weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(
- ideal_docs_groundtruth_labels
- )
+ weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(result_docs_groundtruth_labels)
+ weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(ideal_docs_groundtruth_labels)
 
  if weighted_sum_by_rating_index == 0:
  return math.nan
@@ -226,12 +210,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
 
  for metric_name, metric_value in metrics.items():
  if metric_name in self._threshold_metrics.keys():
- result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+ result[f"{metric_name}_result"] = (
+ "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+ )
  result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
  result[f"{metric_name}_higher_is_better"] = True
 
  elif metric_name in self._threshold_holes.keys():
- result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+ result[f"{metric_name}_result"] = (
+ "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+ )
  result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
  result[f"{metric_name}_higher_is_better"] = False
 
@@ -256,8 +244,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  # if the qrels are empty, no meaningful evaluation is possible
  if not retrieval_ground_truth:
  raise EvaluationException(
- ("'retrieval_ground_truth' parameter must contain at least one item. "
- "Check your data input to be sure that each input record has ground truth defined.")
+ (
+ "'retrieval_ground_truth' parameter must contain at least one item. "
+ "Check your data input to be sure that each input record has ground truth defined."
+ )
  )
 
  qrels = []
@@ -277,9 +267,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )
 
  if not isinstance(query_relevance_label, int):
- raise EvaluationException(
- "Query relevance labels must be integer values."
- )
+ raise EvaluationException("Query relevance labels must be integer values.")
 
  if query_relevance_label < self.ground_truth_label_min:
  raise EvaluationException(
@@ -318,12 +306,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )
  )
 
- if not isinstance(relevance_score, float) and not isinstance(
- relevance_score, int
- ):
- raise EvaluationException(
- "Retrieved document relevance score must be a numerical value."
- )
+ if not isinstance(relevance_score, float) and not isinstance(relevance_score, int):
+ raise EvaluationException("Retrieved document relevance score must be a numerical value.")
 
  results.append(result)
 
@@ -368,24 +352,17 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  results_lookup = {x["document_id"]: x["relevance_score"] for x in results}
 
  # sort each input set by label to get the ranking
- qrels_sorted_by_rank = sorted(
- qrels_lookup.items(), key=lambda x: x[1], reverse=True
- )
- results_sorted_by_rank = sorted(
- results_lookup.items(), key=lambda x: x[1], reverse=True
- )
+ qrels_sorted_by_rank = sorted(qrels_lookup.items(), key=lambda x: x[1], reverse=True)
+ results_sorted_by_rank = sorted(results_lookup.items(), key=lambda x: x[1], reverse=True)
 
  # find ground truth labels for the results set and ideal set
  result_docs_groundtruth_labels = [
- qrels_lookup[doc_id] if doc_id in qrels_lookup else 0
- for (doc_id, _) in results_sorted_by_rank
+ qrels_lookup[doc_id] if doc_id in qrels_lookup else 0 for (doc_id, _) in results_sorted_by_rank
  ]
  ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]
 
  # calculate the proportion of result docs with no ground truth label (holes)
- holes = self._compute_holes(
- [x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank]
- )
+ holes = self._compute_holes([x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank])
  holes_ratio = holes / float(len(results))
 
  # if none of the retrieved docs are labeled, report holes only
@@ -412,12 +389,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  result_docs_groundtruth_labels[: self.k],
  ideal_docs_groundtruth_labels[: self.k],
  ),
- f"xdcg@{self.k}": self._compute_xdcg(
- result_docs_groundtruth_labels[: self.k]
- ),
- "fidelity": self._compute_fidelity(
- result_docs_groundtruth_labels, ideal_docs_groundtruth_labels
- ),
+ f"xdcg@{self.k}": self._compute_xdcg(result_docs_groundtruth_labels[: self.k]),
+ "fidelity": self._compute_fidelity(result_docs_groundtruth_labels, ideal_docs_groundtruth_labels),
  "top1_relevance": result_docs_groundtruth_labels[0],
  "top3_max_relevance": max(result_docs_groundtruth_labels[: self.k]),
  "holes": holes,
azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -22,9 +22,9 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
 
  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
  :rtype: Dict[str, str]
 
@@ -52,17 +52,20 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
 
  id = "eci"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+ _OPTIONAL_PARAMS = ["query"]
 
  @override
  def __init__(
  self,
  credential,
  azure_ai_project,
+ **kwargs,
  ):
  super().__init__(
  eval_metric=_InternalEvaluationMetrics.ECI,
  azure_ai_project=azure_ai_project,
  credential=credential,
+ **kwargs,
  )
 
  @overload
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -39,15 +39,15 @@ class F1ScoreEvaluator(EvaluatorBase):
  :caption: Initialize and call an F1ScoreEvaluator.
 
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START f1_score_evaluator]
  :end-before: [END f1_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
  .. admonition:: Example with Threshold:
 
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -58,7 +58,7 @@ class F1ScoreEvaluator(EvaluatorBase):
  :caption: Initialize with threshold and call an F1ScoreEvaluator.
  """
 
- id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+ id = "azureai://built-in/evaluators/f1_score"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
  def __init__(self, *, threshold=0.5):
@@ -147,7 +147,7 @@ class F1ScoreEvaluator(EvaluatorBase):
  if f1_result <= self._threshold:
  binary_result = True
  return {
- "f1_score": f1_result,
+ "f1_score": f1_result,
  "f1_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
  "f1_threshold": self._threshold,
  }
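F1ScoreEvaluator only swaps its cloud id and normalizes whitespace here; the return dict keys are unchanged. A hypothetical call sketch matching those keys (the response/ground_truth keywords follow the evaluator's documented inputs and are not shown in this hunk):

    # Hypothetical sketch: token-overlap F1 between a response and its ground truth.
    from azure.ai.evaluation import F1ScoreEvaluator

    f1_eval = F1ScoreEvaluator(threshold=0.5)
    result = f1_eval(
        response="Paris is the capital of France.",
        ground_truth="The capital of France is Paris.",
    )
    # Keys mirror the return dict in the hunk above.
    print(result["f1_score"], result["f1_result"], result["f1_threshold"])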
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -45,7 +45,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize with threshold and call a FluencyEvaluator.
 
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START fluency_evaluator]
  :end-before: [END fluency_evaluator]
@@ -64,7 +64,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  _PROMPTY_FILE = "fluency.prompty"
  _RESULT_KEY = "fluency"
 
- id = "azureml://registries/azureml/models/Fluency-Evaluator/versions/4"
+ id = "azureai://built-in/evaluators/fluency"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
  @override
@@ -78,7 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
- _higher_is_better=self._higher_is_better
+ _higher_is_better=self._higher_is_better,
  )
 
  @overload
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -34,7 +34,7 @@ class GleuScoreEvaluator(EvaluatorBase):
  :language: python
  :dedent: 8
  :caption: Initialize and call a GleuScoreEvaluator.
-
+
  .. admonition:: Example with Threshold:
 
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -45,17 +45,17 @@ class GleuScoreEvaluator(EvaluatorBase):
  :caption: Initialize with threshold and call a GleuScoreEvaluator.
 
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START gleu_score_evaluator]
  :end-before: [END gleu_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
  """
 
- id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
+ id = "azureai://built-in/evaluators/gleu_score"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
  @override
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -12,9 +12,13 @@ from azure.ai.evaluation._model_configurations import Conversation
  from ..._common.utils import construct_prompty_model_config, validate_model_config
 
  try:
- from ..._user_agent import USER_AGENT
+ from ..._user_agent import UserAgentSingleton
  except ImportError:
- USER_AGENT = "None"
+
+ class UserAgentSingleton:
+ @property
+ def value(self) -> str:
+ return "None"
 
 
  class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
@@ -35,7 +39,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  ~azure.ai.evaluation.OpenAIModelConfiguration]
  :param threshold: The threshold for the groundedness evaluator. Default is 3.
  :type threshold: int
-
+
  .. admonition:: Example:
 
  .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -54,13 +58,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize with threshold and call a GroundednessEvaluator.
 
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START groundedness_evaluator]
  :end-before: [END groundedness_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
  .. note::
@@ -75,7 +79,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  _RESULT_KEY = "groundedness"
  _OPTIONAL_PARAMS = ["query"]
 
- id = "azureml://registries/azureml/models/Groundedness-Evaluator/versions/4"
+ id = "azureai://built-in/evaluators/groundedness"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
  @override
@@ -89,7 +93,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
- _higher_is_better=self._higher_is_better
+ _higher_is_better=self._higher_is_better,
  )
  self._model_config = model_config
  self.threshold = threshold
@@ -165,7 +169,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_model_config = construct_prompty_model_config(
  validate_model_config(self._model_config),
  self._DEFAULT_OPEN_API_VERSION,
- USER_AGENT,
+ UserAgentSingleton().value,
  )
  self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
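GroundednessEvaluator now reads the user agent through UserAgentSingleton().value when building its prompty model config, and its cloud id moves to azureai://built-in/evaluators/groundedness. A hypothetical initialization sketch using the model_config types named in the docstring; the endpoint, deployment, and key are placeholders, and query is optional per _OPTIONAL_PARAMS:

    # Hypothetical sketch: prompty-based groundedness scoring with an Azure OpenAI deployment.
    from azure.ai.evaluation import AzureOpenAIModelConfiguration, GroundednessEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<aoai_resource>.openai.azure.com",  # placeholder
        azure_deployment="<deployment_name>",                       # placeholder
        api_key="<api_key>",                                        # or omit and rely on Entra ID auth
    )

    groundedness_eval = GroundednessEvaluator(model_config, threshold=3)
    result = groundedness_eval(
        query="When was Contoso founded?",               # optional per _OPTIONAL_PARAMS
        context="Contoso was founded in 1998 in Oslo.",
        response="Contoso was founded in 1998.",
    )
    print(result)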