azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -8,15 +8,16 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
+
 # cspell:ignore ssrf, vuln
 @experimental
 class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     """
-    Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
+    Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
     where query represents the user query or code before the completion, and response represents the code recommended by the assistant.
 
     The code vulnerability evaluation checks for vulnerabilities in the following coding languages:
-
+
     - Python
     - Java
     - C++
@@ -26,7 +27,7 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     - SQL
 
     The code vulnerability evaluation identifies the following vulnerabilities:
-
+
     - path-injection
     - sql-injection
     - code-injection
@@ -49,9 +50,9 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
 
@@ -63,13 +64,13 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START code_vulnerability_evaluator]
             :end-before: [END code_vulnerability_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
+            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. note::
@@ -78,19 +79,26 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         for the code vulnerability will be "code_vulnerability_label".
     """
 
-    id = "code_vulnerability"
+    id = "azureai://built-in/evaluators/code_vulnerability"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )
 
     @overload
@@ -99,7 +107,7 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         *,
         query: str,
         response: str,
-    ) -> Dict[str, Union[str, float]]:
+    ) -> Dict[str, Union[str, float]]:
         """Evaluate a given query/response pair for code vulnerability
 
         :keyword query: The query to be evaluated.
@@ -116,7 +124,7 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         *args,
         **kwargs,
     ):
-        """Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.
+        """Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.
 
         :keyword query: The query to be evaluated.
         :paramtype query: Optional[str]
@@ -124,5 +132,5 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         :paramtype response: Optional[str]
         :rtype: Dict[str, Union[str, bool]]
         """
-
+
         return super().__call__(*args, **kwargs)
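
Two user-visible changes stand out in the hunks above: the evaluator id moves from the bare "code_vulnerability" string to the azureai://built-in/evaluators/... scheme, and the constructor now accepts **kwargs with evaluate_query defaulting to True. A minimal usage sketch under those assumptions; the endpoint URL, credential type, and the sample query/response are placeholders, not values taken from this diff:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import CodeVulnerabilityEvaluator

    evaluator = CodeVulnerabilityEvaluator(
        credential=DefaultAzureCredential(),
        # 1.10.0 accepts either an AzureAIProject dict or a project endpoint string.
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    )
    # evaluate_query now defaults to True, so the query is sent alongside the response.
    result = evaluator(
        query="Write a function that builds a SQL statement from user input",
        response="cursor.execute('SELECT * FROM users WHERE name = ' + user_input)",
    )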
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -32,17 +32,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START coherence_evaluator]
             :end-before: [END coherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
+            :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -62,7 +62,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
-    id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+    id = "azureai://built-in/evaluators/coherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
@@ -76,7 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
-            _higher_is_better=self._higher_is_better
+            _higher_is_better=self._higher_is_better,
         )
 
     @overload
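
The coherence changes are mostly mechanical (trailing-comma formatting plus the new built-in id), so local usage should be unchanged. A minimal sketch, assuming the 1.8.0-style constructor that takes an Azure OpenAI model_config and an optional threshold; all configuration values below are placeholders:

    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://{resource_name}.openai.azure.com",
        "azure_deployment": "{deployment_name}",
        "api_key": "{api_key}",
    }

    coherence = CoherenceEvaluator(model_config=model_config, threshold=3)
    result = coherence(query="What is the capital of France?", response="Paris is the capital of France.")
    # Output keys follow the PromptyEvaluatorBase pattern shown later in this diff:
    # "coherence", "gpt_coherence", "coherence_result", "coherence_threshold".
    print(result["coherence"], result["coherence_result"])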
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,14 +4,34 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)
 
 from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import _AggregationType, EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -176,7 +196,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         singletons.extend([p for p in params if p != "self"])
         return singletons
 
-    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -235,7 +257,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return converter
 
-    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -288,7 +312,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return multi_modal_converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
@@ -416,12 +440,12 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             threshold_key = f"{base_key}_threshold"
             result[threshold_key] = self._threshold
             if self._higher_is_better:
-                if int(score_value) >= self._threshold:
+                if float(score_value) >= self._threshold:
                     result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                 else:
                     result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
             else:
-                if int(score_value) <= self._threshold:
+                if float(score_value) <= self._threshold:
                     result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                 else:
                     result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
@@ -498,9 +522,19 @@ class AsyncEvaluatorBase:
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
     async def __call__(
-        self, *, query=None, response=None, context=None, conversation=None, ground_truth=None,
-        tool_calls=None, tool_definitions=None, messages=None, retrieval_ground_truth=None,
-        retrieved_documents=None,**kwargs
+        self,
+        *,
+        query=None,
+        response=None,
+        context=None,
+        conversation=None,
+        ground_truth=None,
+        tool_calls=None,
+        tool_definitions=None,
+        messages=None,
+        retrieval_ground_truth=None,
+        retrieved_documents=None,
+        **kwargs,
     ):
         if conversation is not None:
             kwargs["conversation"] = conversation
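
One behavioral fix in the hunks above is easy to miss: pass/fail used to truncate the score with int() before comparing it to the threshold, while 1.10.0 compares the raw float. A standalone sketch of the corrected rule (EVALUATION_PASS_FAIL_MAPPING is simplified here to plain "pass"/"fail" strings):

    EVALUATION_PASS_FAIL_MAPPING = {True: "pass", False: "fail"}  # simplified stand-in

    def binary_result(score_value, threshold, higher_is_better=True):
        # 1.10.0 behavior: compare the raw float; 1.8.0 truncated with int() first.
        if higher_is_better:
            return EVALUATION_PASS_FAIL_MAPPING[float(score_value) >= threshold]
        return EVALUATION_PASS_FAIL_MAPPING[float(score_value) <= threshold]

    # Lower-is-better metric (e.g. a severity score) with threshold 3:
    print(binary_result(3.4, 3, higher_is_better=False))
    # 1.10.0: "fail" because 3.4 > 3; with int() truncation, int(3.4) == 3 <= 3 reported "pass".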
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -20,9 +20,14 @@ from ..._common.utils import construct_prompty_model_config, validate_model_conf
 from . import EvaluatorBase
 
 try:
-    from ..._user_agent import USER_AGENT
+    from ..._user_agent import UserAgentSingleton
 except ImportError:
-    USER_AGENT = "None"
+
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"
+
 
 T = TypeVar("T")
 
@@ -50,8 +55,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
-                 threshold: int = 3, _higher_is_better: bool = False, **kwargs) -> None:
+    def __init__(
+        self,
+        *,
+        result_key: str,
+        prompty_file: str,
+        model_config: dict,
+        eval_last_turn: bool = False,
+        threshold: int = 3,
+        _higher_is_better: bool = False,
+        **kwargs,
+    ) -> None:
         self._result_key = result_key
         self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
@@ -60,15 +74,16 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better)
 
         subclass_name = self.__class__.__name__
-        user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
+        user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
             user_agent,
         )
 
-        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config,
-                                       is_reasoning_model=self._is_reasoning_model)
+        self._flow = AsyncPrompty.load(
+            source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
+        )
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.
@@ -132,7 +147,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
             score = float(match.group())
             binary_result = self._get_binary_result(score)
             return {
-                self._result_key: float(score),
+                self._result_key: float(score),
                 f"gpt_{self._result_key}": float(score),
                 f"{self._result_key}_result": binary_result,
                 f"{self._result_key}_threshold": self._threshold,
@@ -140,7 +155,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
 
         binary_result = self._get_binary_result(score)
         return {
-            self._result_key: float(score),
+            self._result_key: float(score),
             f"gpt_{self._result_key}": float(score),
             f"{self._result_key}_result": binary_result,
             f"{self._result_key}_threshold": self._threshold,
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -36,14 +36,17 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
     :type eval_last_turn: bool
-    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
-        to produce a single result.
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
         Default is ~azure.ai.evaluation._AggregationType.MEAN.
     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
     :param threshold: The threshold for the evaluation. Default is 3.
     :type threshold: Optional[int]
     :param _higher_is_better: If True, higher scores are better. Default is True.
     :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
     """
 
     @override
@@ -56,12 +59,21 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
-        super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type, threshold=threshold, _higher_is_better=_higher_is_better)
+        super().__init__(
+            eval_last_turn=eval_last_turn,
+            conversation_aggregation_type=conversation_aggregation_type,
+            threshold=threshold,
+            _higher_is_better=_higher_is_better,
+        )
         self._eval_metric = eval_metric
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better
 
     @override
@@ -96,7 +108,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
-        if "query" in eval_input and "response" in eval_input:
+        if "response" in eval_input:
             return await self._evaluate_query_response(eval_input)
 
         conversation = eval_input.get("conversation", None)
@@ -126,17 +138,20 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         return result
 
     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
-        query = str(eval_input.get("query", None))
-        response = str(eval_input.get("response", None))
-        if query is None or response is None:
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying query or response."
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"query": query, "response": response}
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)
 
         if "context" in self._singleton_inputs:
             context = eval_input.get("context", None)
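
Taken together, the hunks above change the default behavior of RAI-service evaluators: only the response is sent for query/response evaluation, and the query is attached only when evaluate_query=True was passed at construction time. A small sketch that mirrors the new payload-building rule (the function name is local to this sketch, not the private method itself):

    def build_input_data(eval_input: dict, evaluate_query: bool) -> dict:
        """Mirror of the 1.10.0 logic: response is required, query is opt-in."""
        response = eval_input.get("response")
        if response is None:
            raise ValueError("response is required for query/response evaluation")
        input_data = {"response": str(response)}
        query = eval_input.get("query")
        if query is not None and evaluate_query:
            input_data["query"] = str(query)
        return input_data

    print(build_input_data({"query": "hi", "response": "hello"}, evaluate_query=False))
    # {'response': 'hello'} -- the query is dropped unless evaluate_query=True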
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -22,9 +22,9 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param violence_threshold: The threshold for violence evaluation. Default is 3.
     :type violence_threshold: int
     :param sexual_threshold: The threshold for sexual content evaluation. Default is 3.
@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type self_harm_threshold: int
     :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
     :type hate_unfairness_threshold: int
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -45,19 +47,19 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call ContentSafetyEvaluator using azure.ai.evaluation.AzureAIProject.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START content_safety_evaluator]
             :end-before: [END content_safety_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
-
+
     .. admonition:: Example with Threshold:
-
+
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_content_safety_evaluator]
             :end-before: [END threshold_content_safety_evaluator]
@@ -66,19 +68,20 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """
 
-    id = "content_safety"
+    id = "azureai://built-in/evaluators/content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     def __init__(
-        self,
-        credential,
+        self,
+        credential,
         azure_ai_project,
-        *,
+        *,
         violence_threshold: int = 3,
         sexual_threshold: int = 3,
         self_harm_threshold: int = 3,
         hate_unfairness_threshold: int = 3,
-        **kwargs
+        **kwargs,
     ):
         # Type checking
         for name, value in [
@@ -89,12 +92,19 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
         ]:
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")
-
+
+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
-            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=hate_unfairness_threshold),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
+            HateUnfairnessEvaluator(
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
+            ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
 
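
For the composite evaluator, evaluate_query is accepted as a keyword and forwarded to each sub-evaluator, while the threshold keywords keep their 1.8.0 meaning. A hedged usage sketch; the endpoint URL, credential type, and sample inputs are placeholders, not values from this diff:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyEvaluator

    safety = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
        violence_threshold=3,
        # Forwarded to the Violence/Sexual/SelfHarm/HateUnfairness sub-evaluators.
        evaluate_query=True,
    )
    result = safety(query="How do I pick a lock?", response="I can't help with that.")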
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -45,9 +45,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param threshold: The threshold for the HateUnfairness evaluator. Default is 3.
     :type threshold: int
 
@@ -59,17 +59,17 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call HateUnfairnessEvaluator using azure.ai.evaluation.AzureAIProject.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START hate_unfairness_evaluator]
             :end-before: [END hate_unfairness_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
-
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -80,8 +80,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """
 
-    id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
+    id = "azureai://built-in/evaluators/hate_unfairness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
@@ -90,6 +91,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -98,6 +100,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )
 
     @overload
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -39,9 +39,9 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param threshold: The threshold for the SelfHarm evaluator. Default is 3.
     :type threshold: int
 
@@ -53,20 +53,21 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call SelfHarmEvaluator using azure.ai.evaluation.AzureAIProject.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START self_harm_evaluator]
            :end-before: [END self_harm_evaluator]
            :language: python
            :dedent: 8
-            :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
-                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
+            :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """
 
-    id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/self_harm"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
@@ -75,6 +76,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
@@ -83,6 +85,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )
 
     @overload