azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (142):
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
@@ -45,6 +45,8 @@ from ._aoai.aoai_grader import AzureOpenAIGrader
45
45
  from ._aoai.label_grader import AzureOpenAILabelGrader
46
46
  from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
47
47
  from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
48
+ from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader
49
+ from ._aoai.python_grader import AzureOpenAIPythonGrader
48
50
 
49
51
 
50
52
  _patch_all = []
@@ -52,13 +54,47 @@ _patch_all = []
52
54
  # The converter from the AI service to the evaluator schema requires a dependency on
53
55
  # ai.projects, but we also don't want to force users installing ai.evaluations to pull
54
56
  # in ai.projects. So we only import it if it's available and the user has ai.projects.
55
- try:
56
- from ._converters._ai_services import AIAgentConverter
57
- _patch_all.append("AIAgentConverter")
58
- except ImportError:
59
- print("[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`.")
57
+ # We use lazy loading to avoid printing messages during import unless the classes are actually used.
58
+ _lazy_imports = {}
60
59
 
61
60
 
61
def _create_lazy_import(class_name, module_path, dependency_name):
    """Create a lazy import function for an optional dependency.

    Args:
        class_name: Name of the class to import.
        module_path: Fully qualified path of the module that defines the class.
        dependency_name: Name of the pip package to suggest in the error message.

    Returns:
        A zero-argument function that imports the module when called and
        returns the requested class.

    Raises:
        ImportError: Raised by the returned function (not by this factory) when
            the import fails; the original error is attached as the cause.
    """

    def lazy_import():
        # Local import keeps this helper self-contained.
        import importlib

        try:
            module = importlib.import_module(module_path)
            cls = getattr(module, class_name)
        except ImportError as err:
            raise ImportError(
                f"Could not import {class_name}. Please install the dependency with `pip install {dependency_name}`."
            ) from err

        # The loader may run many times (once per attribute access), so only
        # record the class name the first time to avoid duplicate entries.
        if class_name not in _patch_all:
            _patch_all.append(class_name)
        return cls

    return lazy_import
85
+
86
+
87
# Converters backed by optional packages. Each value is a loader produced by
# _create_lazy_import; nothing is imported at registration time.
_lazy_imports.update(
    {
        "AIAgentConverter": _create_lazy_import(
            "AIAgentConverter",
            "azure.ai.evaluation._converters._ai_services",
            "azure-ai-projects",
        ),
        "SKAgentConverter": _create_lazy_import(
            "SKAgentConverter",
            "azure.ai.evaluation._converters._sk_services",
            "semantic-kernel",
        ),
    }
)
97
+
62
98
  __all__ = [
63
99
  "evaluate",
64
100
  "CoherenceEvaluator",
@@ -99,6 +135,15 @@ __all__ = [
99
135
  "AzureOpenAILabelGrader",
100
136
  "AzureOpenAIStringCheckGrader",
101
137
  "AzureOpenAITextSimilarityGrader",
138
+ "AzureOpenAIScoreModelGrader",
139
+ "AzureOpenAIPythonGrader",
102
140
  ]
103
141
 
104
- __all__.extend([p for p in _patch_all if p not in __all__])
142
+ __all__.extend([p for p in _patch_all if p not in __all__])
143
+
144
+
145
def __getattr__(name):
    """Handle lazy imports for optional dependencies.

    Called only when normal module attribute lookup fails. If ``name`` is a
    registered lazy import, its loader is run and the result is stored in the
    module namespace, so later lookups find it directly and the loader is not
    run again.

    :param name: The attribute name being looked up on this module.
    :type name: str
    :return: The lazily imported object registered under ``name``.
    :raises AttributeError: If ``name`` is not a registered lazy import.
    :raises ImportError: Propagated from the loader when the optional
        dependency is missing.
    """
    if name in _lazy_imports:
        resolved = _lazy_imports[name]()
        # Cache the result so this hook is bypassed on subsequent accesses.
        globals()[name] = resolved
        return resolved
    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
@@ -7,4 +7,4 @@ from .aoai_grader import AzureOpenAIGrader
7
7
 
8
8
  __all__ = [
9
9
  "AzureOpenAIGrader",
10
- ]
10
+ ]
@@ -5,12 +5,13 @@ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfigurat
5
5
 
6
6
  from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION
7
7
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
8
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
8
9
  from typing import Any, Dict, Union
9
10
  from azure.ai.evaluation._common._experimental import experimental
10
11
 
11
12
 
12
13
  @experimental
13
- class AzureOpenAIGrader():
14
+ class AzureOpenAIGrader:
14
15
  """
15
16
  Base class for Azure OpenAI grader wrappers, recommended only for use by experienced OpenAI API users.
16
17
  Combines a model configuration and any grader configuration
@@ -35,9 +36,15 @@ class AzureOpenAIGrader():
35
36
 
36
37
  """
37
38
 
38
- id = "aoai://general"
39
+ id = "azureai://built-in/evaluators/azure-openai/custom_grader"
39
40
 
40
- def __init__(self, *, model_config : Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], grader_config: Dict[str, Any], **kwargs: Any):
41
+ def __init__(
42
+ self,
43
+ *,
44
+ model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
45
+ grader_config: Dict[str, Any],
46
+ **kwargs: Any,
47
+ ):
41
48
  self._model_config = model_config
42
49
  self._grader_config = grader_config
43
50
 
@@ -45,8 +52,6 @@ class AzureOpenAIGrader():
45
52
  self._validate_model_config()
46
53
  self._validate_grader_config()
47
54
 
48
-
49
-
50
55
  def _validate_model_config(self) -> None:
51
56
  """Validate the model configuration that this grader wrapper is using."""
52
57
  if "api_key" not in self._model_config or not self._model_config.get("api_key"):
@@ -57,7 +62,7 @@ class AzureOpenAIGrader():
57
62
  category=ErrorCategory.INVALID_VALUE,
58
63
  target=ErrorTarget.AOAI_GRADER,
59
64
  )
60
-
65
+
61
66
  def _validate_grader_config(self) -> None:
62
67
  """Validate the grader configuration that this grader wrapper is using."""
63
68
 
@@ -71,19 +76,24 @@ class AzureOpenAIGrader():
71
76
  :return: The OpenAI client.
72
77
  :rtype: [~openai.OpenAI, ~openai.AzureOpenAI]
73
78
  """
79
+ default_headers = {"User-Agent": UserAgentSingleton().value}
74
80
  if "azure_endpoint" in self._model_config:
75
- from openai import AzureOpenAI
76
- # TODO set default values?
77
- return AzureOpenAI(
81
+ from openai import AzureOpenAI
82
+
83
+ # TODO set default values?
84
+ return AzureOpenAI(
78
85
  azure_endpoint=self._model_config["azure_endpoint"],
79
- api_key=self._model_config.get("api_key", None), # Default-style access to appease linters.
80
- api_version=DEFAULT_AOAI_API_VERSION, # Force a known working version
86
+ api_key=self._model_config.get("api_key", None), # Default-style access to appease linters.
87
+ api_version=DEFAULT_AOAI_API_VERSION, # Force a known working version
81
88
  azure_deployment=self._model_config.get("azure_deployment", ""),
89
+ default_headers=default_headers,
82
90
  )
83
91
  from openai import OpenAI
92
+
84
93
  # TODO add default values for base_url and organization?
85
94
  return OpenAI(
86
95
  api_key=self._model_config["api_key"],
87
96
  base_url=self._model_config.get("base_url", ""),
88
97
  organization=self._model_config.get("organization", ""),
98
+ default_headers=default_headers,
89
99
  )
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
9
9
 
10
10
  from .aoai_grader import AzureOpenAIGrader
11
11
 
12
+
12
13
  @experimental
13
14
  class AzureOpenAILabelGrader(AzureOpenAIGrader):
14
15
  """
@@ -42,12 +43,12 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
42
43
 
43
44
  """
44
45
 
45
- id = "aoai://label_model"
46
+ id = "azureai://built-in/evaluators/azure-openai/label_grader"
46
47
 
47
48
  def __init__(
48
49
  self,
49
50
  *,
50
- model_config : Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
51
+ model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
51
52
  input: List[Dict[str, str]],
52
53
  labels: List[str],
53
54
  model: str,
@@ -0,0 +1,84 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ from typing import Any, Dict, Union, Optional
5
+
6
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
7
+ from openai.types.graders import PythonGrader
8
+ from azure.ai.evaluation._common._experimental import experimental
9
+
10
+ from .aoai_grader import AzureOpenAIGrader
11
+
12
+
13
@experimental
class AzureOpenAIPythonGrader(AzureOpenAIGrader):
    """
    Wrapper class for OpenAI's Python code graders.

    Lets callers supply their own Python scoring logic together with a
    pass/fail threshold. The user-provided source is what the grader runs to
    judge outputs against custom criteria.

    Passing an instance of this class to the `evaluate` method triggers an
    asynchronous grader evaluation through the OpenAI API, and the outcome is
    merged into the standard evaluation results.

    :param model_config: The model configuration to use for the grader.
    :type model_config: Union[
        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration
    ]
    :param name: The name of the grader.
    :type name: str
    :param image_tag: The image tag for the Python execution environment.
    :type image_tag: str
    :param pass_threshold: Score threshold for pass/fail classification.
        Scores >= threshold are considered passing. Must lie in [0.0, 1.0].
    :type pass_threshold: float
    :param source: Python source code containing the grade function.
        Must define: def grade(sample: dict, item: dict) -> float
    :type source: str
    :param kwargs: Additional keyword arguments to pass to the grader.
    :type kwargs: Any
    :raises ValueError: If ``pass_threshold`` is outside [0.0, 1.0].


    .. admonition:: Example:

        .. literalinclude:: ../samples/evaluation_samples_common.py
            :start-after: [START python_grader_example]
            :end-before: [END python_grader_example]
            :language: python
            :dedent: 8
            :caption: Using AzureOpenAIPythonGrader for custom evaluation logic.
    """

    id = "azureai://built-in/evaluators/azure-openai/python_grader"

    def __init__(
        self,
        *,
        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
        name: str,
        image_tag: str,
        pass_threshold: float,
        source: str,
        **kwargs: Any,
    ):
        # Reject thresholds outside the unit interval before building anything.
        threshold_in_bounds = 0.0 <= pass_threshold <= 1.0
        if not threshold_in_bounds:
            raise ValueError("pass_threshold must be between 0.0 and 1.0")

        # Kept on the instance so it is available after construction.
        self.pass_threshold = pass_threshold

        # Build the OpenAI grader definition and hand it to the base wrapper.
        python_grader_config = PythonGrader(
            name=name,
            image_tag=image_tag,
            pass_threshold=pass_threshold,
            source=source,
            type="python",
        )
        super().__init__(model_config=model_config, grader_config=python_grader_config, **kwargs)
@@ -0,0 +1,91 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ from typing import Any, Dict, Union, List, Optional
5
+
6
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
7
+ from openai.types.graders import ScoreModelGrader
8
+ from azure.ai.evaluation._common._experimental import experimental
9
+
10
+ from .aoai_grader import AzureOpenAIGrader
11
+
12
+
13
@experimental
class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
    """
    Wrapper class for OpenAI's score model graders.

    Provides continuous scoring driven by custom prompts and
    conversation-style inputs, with a configurable score range and a pass
    threshold for binary classification.

    Passing an instance of this class to the `evaluate` method triggers an
    asynchronous grader evaluation through the OpenAI API, and the outcome is
    merged into the standard evaluation results.

    :param model_config: The model configuration to use for the grader.
    :type model_config: Union[
        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration
    ]
    :param input: The input messages for the grader. List of conversation
        messages with role and content.
    :type input: List[Dict[str, str]]
    :param model: The model to use for the evaluation.
    :type model: str
    :param name: The name of the grader.
    :type name: str
    :param range: The range of the score. Defaults to [0, 1].
    :type range: Optional[List[float]]
    :param pass_threshold: Score threshold for pass/fail classification.
        Defaults to midpoint of range.
    :type pass_threshold: Optional[float]
    :param sampling_params: The sampling parameters for the model.
    :type sampling_params: Optional[Dict[str, Any]]
    :param kwargs: Additional keyword arguments to pass to the grader.
    :type kwargs: Any
    :raises ValueError: If ``range`` is not ``[min, max]`` with ``min < max``,
        or if ``pass_threshold`` falls outside ``range``.
    """

    id = "azureai://built-in/evaluators/azure-openai/score_model_grader"

    def __init__(
        self,
        *,
        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
        input: List[Dict[str, str]],
        model: str,
        name: str,
        range: Optional[List[float]] = None,
        pass_threshold: Optional[float] = None,
        sampling_params: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ):
        # Resolve the score range: fall back to the default, otherwise validate it.
        if range is None:
            range = [0.0, 1.0]  # Default range
        elif len(range) != 2 or range[0] >= range[1]:
            raise ValueError("range must be a list of two numbers [min, max] where min < max")

        # Resolve the threshold: default to the midpoint, otherwise check the bounds.
        if pass_threshold is None:
            pass_threshold = (range[0] + range[1]) / 2  # Default to midpoint
        elif range and (pass_threshold < range[0] or pass_threshold > range[1]):
            raise ValueError(f"pass_threshold {pass_threshold} must be within range {range}")

        # Kept on the instance so it is available after construction.
        self.pass_threshold = pass_threshold

        # Assemble the arguments for the OpenAI ScoreModelGrader. By this point
        # range is never None, so it is always part of the configuration.
        score_model_args = {
            "input": input,
            "model": model,
            "name": name,
            "type": "score_model",
            "range": range,
        }
        if sampling_params is not None:
            score_model_args["sampling_params"] = sampling_params
        score_model_args["pass_threshold"] = self.pass_threshold

        score_model_config = ScoreModelGrader(**score_model_args)

        super().__init__(model_config=model_config, grader_config=score_model_config, **kwargs)
@@ -10,6 +10,7 @@ from azure.ai.evaluation._common._experimental import experimental
10
10
 
11
11
  from .aoai_grader import AzureOpenAIGrader
12
12
 
13
+
13
14
  @experimental
14
15
  class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
15
16
  """
@@ -38,12 +39,12 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
38
39
 
39
40
  """
40
41
 
41
- id = "aoai://string_check"
42
+ id = "azureai://built-in/evaluators/azure-openai/string_check_grader"
42
43
 
43
44
  def __init__(
44
45
  self,
45
46
  *,
46
- model_config : Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
47
+ model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
47
48
  input: str,
48
49
  name: str,
49
50
  operation: Literal[
@@ -10,6 +10,7 @@ from azure.ai.evaluation._common._experimental import experimental
10
10
 
11
11
  from .aoai_grader import AzureOpenAIGrader
12
12
 
13
+
13
14
  @experimental
14
15
  class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
15
16
  """
@@ -52,12 +53,12 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
52
53
 
53
54
  """
54
55
 
55
- id = "aoai://text_similarity"
56
+ id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader"
56
57
 
57
58
  def __init__(
58
59
  self,
59
60
  *,
60
- model_config : Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
61
+ model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
61
62
  evaluation_metric: Literal[
62
63
  "fuzzy_match",
63
64
  "bleu",
@@ -19,6 +19,7 @@ from azure.core.pipeline.policies import ProxyPolicy, AsyncRetryPolicy
19
19
 
20
20
  class AzureEnvironmentMetadata(TypedDict):
21
21
  """Configuration for various Azure environments. All endpoints include a trailing slash."""
22
+
22
23
  portal_endpoint: str
23
24
  """The management portal for the Azure environment (e.g. https://portal.azure.com/)"""
24
25
  resource_manager_endpoint: str
@@ -107,15 +108,15 @@ class AzureEnvironmentClient:
107
108
 
108
109
  def case_insensitive_match(d: Mapping[str, Any], key: str) -> Optional[Any]:
109
110
  key = key.strip().lower()
110
- return next((v for k,v in d.items() if k.strip().lower() == key), None)
111
+ return next((v for k, v in d.items() if k.strip().lower() == key), None)
111
112
 
112
113
  async with _ASYNC_LOCK:
113
114
  cloud = _KNOWN_AZURE_ENVIRONMENTS.get(name) or case_insensitive_match(_KNOWN_AZURE_ENVIRONMENTS, name)
114
115
  if cloud:
115
116
  return cloud
116
- default_endpoint = (_KNOWN_AZURE_ENVIRONMENTS
117
- .get(_DEFAULT_AZURE_ENV_NAME, {})
118
- .get("resource_manager_endpoint"))
117
+ default_endpoint = _KNOWN_AZURE_ENVIRONMENTS.get(_DEFAULT_AZURE_ENV_NAME, {}).get(
118
+ "resource_manager_endpoint"
119
+ )
119
120
 
120
121
  metadata_url = self.get_default_metadata_url(default_endpoint)
121
122
  clouds = await self.get_clouds_async(metadata_url=metadata_url, update_cached=update_cached)
@@ -124,10 +125,7 @@ class AzureEnvironmentClient:
124
125
  return cloud_metadata
125
126
 
126
127
  async def get_clouds_async(
127
- self,
128
- *,
129
- metadata_url: Optional[str] = None,
130
- update_cached: bool = True
128
+ self, *, metadata_url: Optional[str] = None, update_cached: bool = True
131
129
  ) -> Mapping[str, AzureEnvironmentMetadata]:
132
130
  metadata_url = metadata_url or self.get_default_metadata_url()
133
131
 
@@ -149,7 +147,8 @@ class AzureEnvironmentClient:
149
147
  default_endpoint = default_endpoint or "https://management.azure.com/"
150
148
  metadata_url = os.getenv(
151
149
  _ENV_ARM_CLOUD_METADATA_URL,
152
- f"{default_endpoint}metadata/endpoints?api-version={AzureEnvironmentClient.DEFAULT_API_VERSION}")
150
+ f"{default_endpoint}metadata/endpoints?api-version={AzureEnvironmentClient.DEFAULT_API_VERSION}",
151
+ )
153
152
  return metadata_url
154
153
 
155
154
  @staticmethod
@@ -197,7 +196,7 @@ class AzureEnvironmentClient:
197
196
 
198
197
  def recursive_update(d: Dict, u: Mapping) -> None:
199
198
  """Recursively update a dictionary.
200
-
199
+
201
200
  :param Dict d: The dictionary to update.
202
201
  :param Mapping u: The mapping to update from.
203
202
  """
@@ -73,7 +73,13 @@ class AzureMLTokenManager(APITokenManager):
73
73
  return super().get_aad_credential()
74
74
 
75
75
  def get_token(
76
- self, scopes = None, claims: Union[str, None] = None, tenant_id: Union[str, None] = None, enable_cae: bool = False, **kwargs: Any) -> AccessToken:
76
+ self,
77
+ scopes=None,
78
+ claims: Union[str, None] = None,
79
+ tenant_id: Union[str, None] = None,
80
+ enable_cae: bool = False,
81
+ **kwargs: Any
82
+ ) -> AccessToken:
77
83
  """Get the API token. If the token is not available or has expired, refresh the token.
78
84
 
79
85
  :return: API token
@@ -5,8 +5,17 @@ from enum import Enum
5
5
 
6
6
  from azure.core import CaseInsensitiveEnumMeta
7
7
 
8
- PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency", "intent_resolution",
9
- "tool_call_accurate", "response_completeness", "task_adherence"]
8
+ PROMPT_BASED_REASON_EVALUATORS = [
9
+ "coherence",
10
+ "relevance",
11
+ "retrieval",
12
+ "groundedness",
13
+ "fluency",
14
+ "intent_resolution",
15
+ "tool_call_accurate",
16
+ "response_completeness",
17
+ "task_adherence",
18
+ ]
10
19
 
11
20
 
12
21
  class CommonConstants:
@@ -6,13 +6,22 @@ import logging
6
6
  from typing import Union, Any, Dict
7
7
  from azure.core.credentials import AzureKeyCredential, TokenCredential
8
8
  from azure.ai.evaluation._common.onedp import AIProjectClient as RestEvaluationServiceClient
9
- from azure.ai.evaluation._common.onedp.models import (PendingUploadRequest, PendingUploadType, EvaluationResult,
10
- ResultType, AssetCredentialRequest, EvaluationUpload, InputDataset, RedTeamUpload)
9
+ from azure.ai.evaluation._common.onedp.models import (
10
+ PendingUploadRequest,
11
+ PendingUploadType,
12
+ EvaluationResult,
13
+ ResultType,
14
+ AssetCredentialRequest,
15
+ EvaluationUpload,
16
+ InputDataset,
17
+ RedTeamUpload,
18
+ )
11
19
  from azure.storage.blob import ContainerClient
12
20
  from .utils import upload
13
21
 
14
22
  LOGGER = logging.getLogger(__name__)
15
23
 
24
+
16
25
  class EvaluationServiceOneDPClient:
17
26
 
18
27
  def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None:
@@ -23,7 +32,15 @@ class EvaluationServiceOneDPClient:
23
32
  )
24
33
 
25
34
  def create_evaluation_result(
26
- self, *, name: str, path: str, version=1, metrics: Dict[str, int]=None, result_type: ResultType=ResultType.EVALUATION, **kwargs) -> EvaluationResult:
35
+ self,
36
+ *,
37
+ name: str,
38
+ path: str,
39
+ version=1,
40
+ metrics: Dict[str, int] = None,
41
+ result_type: ResultType = ResultType.EVALUATION,
42
+ **kwargs,
43
+ ) -> EvaluationResult:
27
44
  """Create and upload evaluation results to Azure evaluation service.
28
45
 
29
46
  This method uploads evaluation results from a local path to Azure Blob Storage
@@ -49,17 +66,20 @@ class EvaluationServiceOneDPClient:
49
66
  :raises: Various exceptions from the underlying API calls or upload process
50
67
  """
51
68
 
52
- LOGGER.debug(f"Creating evaluation result for {name} with version {version} type {result_type} from path {path}")
69
+ LOGGER.debug(
70
+ f"Creating evaluation result for {name} with version {version} type {result_type} from path {path}"
71
+ )
53
72
  start_pending_upload_response = self.rest_client.evaluation_results.start_pending_upload(
54
73
  name=name,
55
74
  version=version,
56
75
  body=PendingUploadRequest(pending_upload_type=PendingUploadType.TEMPORARY_BLOB_REFERENCE),
57
- **kwargs
76
+ **kwargs,
58
77
  )
59
78
 
60
79
  LOGGER.debug(f"Uploading {path} to {start_pending_upload_response.blob_reference_for_consumption.blob_uri}")
61
80
  with ContainerClient.from_container_url(
62
- start_pending_upload_response.blob_reference_for_consumption.credential.sas_uri) as container_client:
81
+ start_pending_upload_response.blob_reference_for_consumption.credential.sas_uri
82
+ ) as container_client:
63
83
  upload(path=path, container_client=container_client, logger=LOGGER)
64
84
 
65
85
  LOGGER.debug(f"Creating evaluation result version for {name} with version {version}")
@@ -73,7 +93,7 @@ class EvaluationServiceOneDPClient:
73
93
  ),
74
94
  name=name,
75
95
  version=version,
76
- **kwargs
96
+ **kwargs,
77
97
  )
78
98
 
79
99
  return create_version_response
@@ -90,10 +110,7 @@ class EvaluationServiceOneDPClient:
90
110
  :rtype: EvaluationUpload
91
111
  :raises: Various exceptions from the underlying API calls
92
112
  """
93
- upload_run_response = self.rest_client.evaluations.upload_run(
94
- evaluation=evaluation,
95
- **kwargs
96
- )
113
+ upload_run_response = self.rest_client.evaluations.upload_run(evaluation=evaluation, **kwargs)
97
114
 
98
115
  return upload_run_response
99
116
 
@@ -112,11 +129,7 @@ class EvaluationServiceOneDPClient:
112
129
  :rtype: EvaluationUpload
113
130
  :raises: Various exceptions from the underlying API calls
114
131
  """
115
- update_run_response = self.rest_client.evaluations.upload_update_run(
116
- name=name,
117
- evaluation=evaluation,
118
- **kwargs
119
- )
132
+ update_run_response = self.rest_client.evaluations.upload_update_run(name=name, evaluation=evaluation, **kwargs)
120
133
 
121
134
  return update_run_response
122
135
 
@@ -132,10 +145,7 @@ class EvaluationServiceOneDPClient:
132
145
  :rtype: ~azure.ai.evaluation._common.onedp.models.RedTeamUpload
133
146
  :raises: Various exceptions from the underlying API calls
134
147
  """
135
- upload_run_response = self.rest_client.red_teams.upload_run(
136
- redteam=red_team,
137
- **kwargs
138
- )
148
+ upload_run_response = self.rest_client.red_teams.upload_run(redteam=red_team, **kwargs)
139
149
 
140
150
  return upload_run_response
141
151
 
@@ -154,10 +164,6 @@ class EvaluationServiceOneDPClient:
154
164
  :rtype: ~azure.ai.evaluation._common.onedp.models.RedTeamUpload
155
165
  :raises: Various exceptions from the underlying API calls
156
166
  """
157
- update_run_response = self.rest_client.red_teams.upload_update_run(
158
- name=name,
159
- redteam=red_team,
160
- **kwargs
161
- )
167
+ update_run_response = self.rest_client.red_teams.upload_update_run(name=name, redteam=red_team, **kwargs)
162
168
 
163
- return update_run_response
169
+ return update_run_response