azure-ai-evaluation 1.6.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (55)
  1. azure/ai/evaluation/__init__.py +1 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +1 -1
  3. azure/ai/evaluation/_aoai/label_grader.py +2 -2
  4. azure/ai/evaluation/_aoai/string_check_grader.py +2 -2
  5. azure/ai/evaluation/_aoai/text_similarity_grader.py +2 -2
  6. azure/ai/evaluation/_common/__init__.py +3 -1
  7. azure/ai/evaluation/_common/evaluation_onedp_client.py +50 -5
  8. azure/ai/evaluation/_common/onedp/operations/_operations.py +1 -1
  9. azure/ai/evaluation/_common/rai_service.py +7 -6
  10. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  11. azure/ai/evaluation/_converters/_models.py +76 -6
  12. azure/ai/evaluation/_eval_mapping.py +2 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +11 -13
  14. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +24 -5
  15. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  16. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  17. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  18. azure/ai/evaluation/_evaluators/_common/_base_eval.py +4 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  20. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  22. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  24. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +31 -29
  25. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  26. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +10 -0
  27. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  28. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +10 -0
  29. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +10 -0
  30. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  31. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  32. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  33. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +10 -0
  34. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +13 -0
  35. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  36. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  37. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  38. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +10 -0
  39. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -0
  40. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +80 -10
  41. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  42. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  43. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +26 -7
  44. azure/ai/evaluation/_version.py +1 -1
  45. azure/ai/evaluation/red_team/_red_team.py +183 -128
  46. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  47. azure/ai/evaluation/simulator/_direct_attack_simulator.py +3 -3
  48. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +3 -3
  49. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -0
  50. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +6 -5
  51. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +26 -3
  52. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +55 -55
  53. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  54. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  55. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
@@ -3,10 +3,18 @@ import json
 
 from pydantic import BaseModel
 
-from azure.ai.projects.models import RunStepFunctionToolCall
-
 from typing import List, Optional, Union
 
+# Models moved in a later version of agents SDK, so try a few different locations
+try:
+    from azure.ai.projects.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+try:
+    from azure.ai.agents.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+
 # Message roles constants.
 _SYSTEM = "system"
 _USER = "user"
@@ -21,6 +29,57 @@ _FUNCTION = "function"
 # This is returned by AI services in the API to filter against tool invocations.
 _TOOL_CALLS = "tool_calls"
 
+# Constants to only be used internally in this file for the built-in tools.
+_CODE_INTERPRETER = "code_interpreter"
+_BING_GROUNDING = "bing_grounding"
+_FILE_SEARCH = "file_search"
+_AZURE_AI_SEARCH = "azure_ai_search"
+_FABRIC_DATAAGENT = "fabric_dataagent"
+
+# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
+# for evaluation purposes.
+_BUILT_IN_DESCRIPTIONS = {
+    _CODE_INTERPRETER: "Use code interpreter to read and interpret information from datasets, "
+    + "generate code, and create graphs and charts using your data. Supports "
+    + "up to 20 files.",
+    _BING_GROUNDING: "Enhance model output with web data.",
+    _FILE_SEARCH: "Search for data across uploaded files.",
+    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
+}
+
+# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
+_BUILT_IN_PARAMS = {
+    _CODE_INTERPRETER: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Generated code to be executed."}},
+    },
+    _BING_GROUNDING: {
+        "type": "object",
+        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
+    },
+    _FILE_SEARCH: {
+        "type": "object",
+        "properties": {
+            "ranking_options": {
+                "type": "object",
+                "properties": {
+                    "ranker": {"type": "string", "description": "Ranking algorithm to use."},
+                    "score_threshold": {"type": "number", "description": "Threshold for search results."},
+                },
+                "description": "Ranking options for search results.",
+            }
+        },
+    },
+    _AZURE_AI_SEARCH: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+    _FABRIC_DATAAGENT: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+}
 
 class Message(BaseModel):
     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
@@ -98,6 +157,8 @@ class ToolDefinition(BaseModel):
 
     :param name: The name of the tool.
    :type name: str
+    :param type: The type of the tool.
+    :type type: str
     :param description: A description of the tool.
     :type description: str
     :param parameters: The parameters required by the tool.
@@ -105,6 +166,7 @@ class ToolDefinition(BaseModel):
     """
 
     name: str
+    type: str
     description: Optional[str] = None
     parameters: dict
 
@@ -191,6 +253,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
            arguments = {
                 "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
             }
+        elif tool_call.details["type"] == "azure_ai_search":
+            arguments = {"input": tool_call.details["azure_ai_search"]["input"]}
+        elif tool_call.details["type"] == "fabric_dataagent":
+            arguments = {"input": tool_call.details["fabric_dataagent"]["input"]}
         else:
             # unsupported tool type, skip
             return messages
@@ -211,17 +277,17 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
     if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function.output)
+        output = safe_loads(tool_call.details.function["output"])
     else:
         try:
             # Some built-ins may have output, others may not
             # Try to retrieve it, but if we don't find anything, skip adding the message
             # Just manually converting to dicts for easy serialization for now rather than custom serializers
-            if tool_call.details.type == "code_interpreter":
+            if tool_call.details.type == _CODE_INTERPRETER:
                 output = tool_call.details.code_interpreter.outputs
-            elif tool_call.details.type == "bing_grounding":
+            elif tool_call.details.type == _BING_GROUNDING:
                 return messages  # not supported yet from bing grounding tool
-            elif tool_call.details.type == "file_search":
+            elif tool_call.details.type == _FILE_SEARCH:
                 output = [
                     {
                         "file_id": result.file_id,
@@ -231,6 +297,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
                     }
                     for result in tool_call.details.file_search.results
                 ]
+            elif tool_call.details.type == _AZURE_AI_SEARCH:
+                output = tool_call.details.azure_ai_search["output"]
+            elif tool_call.details.type == _FABRIC_DATAAGENT:
+                output = tool_call.details.fabric_dataagent["output"]
         except:
             return messages
 
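The converter hunks above import RunStepFunctionToolCall defensively because the class moved between azure.ai.projects and azure.ai.agents; if both imports fail the name is simply left undefined. A minimal, hypothetical sketch of the same fallback with an explicit sentinel (illustration of the pattern only, not the shipped implementation):

    # Illustrative only: resolve RunStepFunctionToolCall from whichever agents SDK is installed.
    RunStepFunctionToolCall = None
    for _module in ("azure.ai.projects.models", "azure.ai.agents.models"):
        try:
            _mod = __import__(_module, fromlist=["RunStepFunctionToolCall"])
            RunStepFunctionToolCall = getattr(_mod, "RunStepFunctionToolCall")
            break
        except (ImportError, AttributeError):
            continue

    if RunStepFunctionToolCall is None:
        # Neither SDK flavor is installed; callers should skip isinstance checks against it.
        pass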
@@ -16,6 +16,7 @@ from azure.ai.evaluation import (
     CodeVulnerabilityEvaluator,
     CoherenceEvaluator,
     ContentSafetyEvaluator,
+    DocumentRetrievalEvaluator,
     F1ScoreEvaluator,
     FluencyEvaluator,
     GleuScoreEvaluator,
@@ -45,6 +46,7 @@ EVAL_CLASS_MAP = {
     CodeVulnerabilityEvaluator: "code_vulnerability",
     CoherenceEvaluator: "coherence",
     ContentSafetyEvaluator: "content_safety",
+    DocumentRetrievalEvaluator: "document_retrieval",
     ECIEvaluator: "eci",
     F1ScoreEvaluator: "f1_score",
     FluencyEvaluator: "fluency",
@@ -141,7 +141,6 @@ def _aggregate_content_safety_metrics(
         module = inspect.getmodule(evaluators[evaluator_name])
         if (
             module
-            and module.__name__.startswith("azure.ai.evaluation.")
             and metric_name.endswith("_score")
             and metric_name.replace("_score", "") in content_safety_metrics
         ):
@@ -739,7 +738,17 @@ def evaluate(
             :end-before: [END evaluate_method]
             :language: python
             :dedent: 8
-            :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
+            :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
     try:
         return _evaluate(
@@ -978,17 +987,6 @@ def _preprocess_data(
     # Split normal evaluators and OAI graders
     evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
 
-    input_data_df = _validate_and_load_data(
-        target,
-        data,
-        evaluators_and_graders,
-        output_path,
-        azure_ai_project,
-        evaluation_name
-    )
-    if target is not None:
-        _validate_columns_for_target(input_data_df, target)
-
     target_run: Optional[BatchClientRun] = None
     target_generated_columns: Set[str] = set()
     batch_run_client: BatchClient
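The evaluate() docstring now documents the Foundry-style project endpoint alongside the existing AzureAIProject dict. A minimal usage sketch, assuming azure_ai_project accepts the endpoint URL string directly and that data.jsonl (a placeholder path) holds "response" and "ground_truth" columns:

    from azure.ai.evaluation import evaluate, F1ScoreEvaluator

    result = evaluate(
        data="data.jsonl",  # placeholder: JSONL with "response" and "ground_truth" columns
        evaluators={"f1_score": F1ScoreEvaluator()},
        # New-style project endpoint; the older dict-based AzureAIProject config also remains valid.
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    )
    print(result["metrics"])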
@@ -208,7 +208,7 @@ def _get_single_run_results(
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
-            + " failed with status {run_results.status}.",
+            + f" failed with status {run_results.status}.",
             blame=ErrorBlame.UNKNOWN,
             category=ErrorCategory.FAILED_EXECUTION,
             target=ErrorTarget.AOAI_GRADER,
@@ -217,6 +217,16 @@
         + " completed successfully. Gathering results...")
     # Convert run results into a dictionary of metrics
     run_metrics = {}
+    if run_results.per_testing_criteria_results is None:
+        msg = ("AOAI evaluation run returned no results, despite 'completed' status. This might" +
+               " occur when invalid or conflicting models are selected in the model and grader configs."
+               f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}")
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.AOAI_GRADER,
+        )
     for criteria_result in run_results.per_testing_criteria_results:
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
@@ -240,8 +250,12 @@
         eval_id=run_info["eval_group_id"],
         run_id=run_info["eval_run_id"]
     )
-    listed_results = {}
+    listed_results = {"index": []}
+    # raw data has no order guarantees, we need to sort them by their
+    # datasource_item_id
     for row_result in raw_list_results.data:
+        # Add the datasource_item_id for later sorting
+        listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
             grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
             for name, value in single_grader_row_result.items():
@@ -251,14 +265,19 @@
                 # create a `_result` column for each grader
                 result_column_name = f"outputs.{grader_name}.{grader_name}_result"
                 if len(result_column_name) < 50: #TODO: is this the limit? Should we keep "passed"?
-                    listed_results[result_column_name] = EVALUATION_PASS_FAIL_MAPPING[value]
+                    if (result_column_name not in listed_results):
+                        listed_results[result_column_name] = []
+                    listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
 
                 formatted_column_name = f"outputs.{grader_name}.{name}"
                 if (formatted_column_name not in listed_results):
                     listed_results[formatted_column_name] = []
-                listed_results[f"outputs.{grader_name}.{name}"].append(value)
+                listed_results[formatted_column_name].append(value)
     output_df = pd.DataFrame(listed_results)
-
+    # sort by index
+    output_df = output_df.sort_values('index', ascending=[True])
+    # remove index column
+    output_df.drop(columns=["index"], inplace=True)
     return output_df, run_metrics
 
 
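The grader-result changes collect one list per output column plus an "index" column of datasource_item_id values, then sort on it and drop it so rows line up with the input dataset. A compact pandas sketch of that pattern (grader and column names are invented for illustration):

    import pandas as pd

    # Simulated per-row grader results, arriving in arbitrary order.
    rows = [
        {"datasource_item_id": 2, "score": 0.4},
        {"datasource_item_id": 0, "score": 0.9},
        {"datasource_item_id": 1, "score": 0.7},
    ]

    listed_results = {"index": [], "outputs.grader.score": []}
    for row in rows:
        listed_results["index"].append(row["datasource_item_id"])
        listed_results["outputs.grader.score"].append(row["score"])

    # Restore the original dataset order, then drop the helper column.
    output_df = pd.DataFrame(listed_results).sort_values("index", ascending=True)
    output_df.drop(columns=["index"], inplace=True)
    print(output_df)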
@@ -33,7 +33,17 @@ class BleuScoreEvaluator(EvaluatorBase):
             :end-before: [END bleu_score_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call an BleuScoreEvaluator.
+            :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -62,7 +62,15 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :end-before: [END code_vulnerability_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a CodeVulnerabilityEvaluator with a query and response.
+            :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START code_vulnerability_evaluator]
+            :end-before: [END code_vulnerability_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. note::
 
@@ -31,7 +31,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :end-before: [END coherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a CoherenceEvaluator with a query and response.
+            :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
 
@@ -40,7 +50,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :end-before: [END threshold_coherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and and call a CoherenceEvaluator with a query and response.
+            :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
 
     .. note::
 
@@ -86,6 +86,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type _higher_is_better: Optional[bool]
     """
 
+    _NOT_APPLICABLE_RESULT = "not applicable"
+    _PASS_RESULT = "pass"
+    _FAIL_RESULT = "fail"
+
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
     # Make sure to call super().__init__() in the child class's __init__ method.
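These class-level constants give every evaluator a shared vocabulary for binary verdicts. A hedged sketch of how a subclass might map a numeric score onto them (the threshold logic here is illustrative, not copied from the SDK):

    class ExampleScoreEvaluator:
        # Mirrors the constants added to EvaluatorBase in the hunk above.
        _NOT_APPLICABLE_RESULT = "not applicable"
        _PASS_RESULT = "pass"
        _FAIL_RESULT = "fail"

        def __init__(self, threshold: float = 3.0, higher_is_better: bool = True):
            self._threshold = threshold
            self._higher_is_better = higher_is_better

        def _get_binary_result(self, score):
            if score is None:
                return self._NOT_APPLICABLE_RESULT
            if self._higher_is_better:
                return self._PASS_RESULT if score >= self._threshold else self._FAIL_RESULT
            return self._PASS_RESULT if score <= self._threshold else self._FAIL_RESULT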
@@ -44,7 +44,17 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :end-before: [END content_safety_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a ContentSafetyEvaluator.
+            :caption: Initialize and call ContentSafetyEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START content_safety_evaluator]
+            :end-before: [END content_safety_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
     .. admonition:: Example with Threshold:
 
@@ -53,7 +63,7 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :end-before: [END threshold_content_safety_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and call a ContentSafetyEvaluator.
+            :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """
 
     id = "content_safety"
@@ -58,16 +58,26 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :end-before: [END hate_unfairness_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a HateUnfairnessEvaluator.
-
-    .. admonition:: Example with Threshold:
+            :caption: Initialize and call HateUnfairnessEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START hate_unfairness_evaluator]
+            :end-before: [END hate_unfairness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
+    .. admonition:: Example with Threshold:
+
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_hate_unfairness_evaluator]
             :end-before: [END threshold_hate_unfairness_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and call a HateUnfairnessEvaluator.
+            :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """
 
     id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
@@ -52,16 +52,17 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :end-before: [END self_harm_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a SelfHarmEvaluator.
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_threshold.py
-            :start-after: [START threshold_self_harm_evaluator]
-            :end-before: [END threshold_self_harm_evaluator]
+            :caption: Initialize and call SelfHarmEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START self_harm_evaluator]
+            :end-before: [END self_harm_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and call a SelfHarmEvaluator.
+            :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """
 
     id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
@@ -56,6 +56,16 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a SexualEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START sexual_evaluator]
+            :end-before: [END sexual_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -56,6 +56,16 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a ViolenceEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START violence_evaluator]
+            :end-before: [END violence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
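Each safety evaluator gains the same project-URL admonition. A tentative sketch of constructing one against the new endpoint format, assuming the constructor accepts the URL string where the AzureAIProject dict used to go (the credential, endpoint, and printed result shape are placeholders to verify against the linked samples):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ViolenceEvaluator

    violence_eval = ViolenceEvaluator(
        credential=DefaultAzureCredential(),
        # Foundry project endpoint; the older {subscription_id, resource_group_name, project_name} dict also works.
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    )
    result = violence_eval(query="What is the capital of France?", response="Paris.")
    print(result)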
@@ -4,7 +4,7 @@
 import math
 import operator
 from itertools import starmap
-from typing import Dict, List, TypedDict, Tuple, Optional
+from typing import Any, Dict, List, TypedDict, Tuple, Optional, Union
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
 from azure.ai.evaluation._exceptions import EvaluationException
 from typing_extensions import override, overload
@@ -30,8 +30,18 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
             :end-before: [END document_retrieval_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a Document RetrievalEvaluator
+            :caption: Initialize and call a DocumentRetrievalEvaluator
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START document_retrieval_evaluator]
+            :end-before: [END document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_document_retrieval_evaluator]
@@ -46,7 +56,13 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         *,
         ground_truth_label_min: int = 0,
         ground_truth_label_max: int = 4,
-        threshold: Optional[dict] = None,
+        ndcg_threshold: Optional[float] = 0.5,
+        xdcg_threshold: Optional[float] = 50.0,
+        fidelity_threshold: Optional[float] = 0.5,
+        top1_relevance_threshold: Optional[float] = 50.0,
+        top3_max_relevance_threshold: Optional[float] = 50.0,
+        total_retrieved_documents_threshold: Optional[int] = 50,
+        total_ground_truth_documents_threshold: Optional[int] = 50
     ):
         super().__init__()
         self.k = 3
@@ -71,27 +87,19 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         self.ground_truth_label_max = ground_truth_label_max
 
         # The default threshold for metrics where higher numbers are better.
-        self._threshold_metrics = {
-            "ndcg@3": 0.5,
-            "xdcg@3": 0.5,
-            "fidelity": 0.5,
-            "top1_relevance": 50,
-            "top3_max_relevance": 50,
-            "total_retrieved_documents": 50,
-            "total_ground_truth_documents": 50,
+        self._threshold_metrics: Dict[str, Any] = {
+            "ndcg@3": ndcg_threshold,
+            "xdcg@3": xdcg_threshold,
+            "fidelity": fidelity_threshold,
+            "top1_relevance": top1_relevance_threshold,
+            "top3_max_relevance": top3_max_relevance_threshold,
+            "total_retrieved_documents": total_retrieved_documents_threshold,
+            "total_ground_truth_documents": total_ground_truth_documents_threshold,
         }
 
         # Ideally, the number of holes should be zero.
         self._threshold_holes = {"holes": 0, "holes_ratio": 0}
 
-        if threshold and not isinstance(threshold, dict):
-            raise EvaluationException(
-                f"Threshold must be a dictionary, got {type(threshold)}"
-            )
-
-        elif isinstance(threshold, dict):
-            self._threshold_metrics.update(threshold)
-
     def _compute_holes(self, actual_docs: List[str], labeled_docs: List[str]) -> int:
         """
         The number of documents retrieved from a search query which have no provided ground-truth label.
@@ -214,22 +222,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         return weighted_sum_by_rating_results / float(weighted_sum_by_rating_index)
 
     def _get_binary_result(self, **metrics) -> Dict[str, float]:
-        result = {}
+        result: Dict[str, Any] = {}
 
         for metric_name, metric_value in metrics.items():
             if metric_name in self._threshold_metrics.keys():
-                result[f"{metric_name}_result"] = (
-                    metric_value >= self._threshold_metrics[metric_name]
-                )
-                result[f"{metric_name}_threshold"] = self._threshold_metrics[
-                    metric_name
-                ]
+                result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
                 result[f"{metric_name}_higher_is_better"] = True
 
             elif metric_name in self._threshold_holes.keys():
-                result[f"{metric_name}_result"] = (
-                    metric_value <= self._threshold_holes[metric_name]
-                )
+                result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
                 result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
                 result[f"{metric_name}_higher_is_better"] = False
 
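DocumentRetrievalEvaluator's dict-based threshold argument is replaced by explicit keyword arguments, and binary results now report "pass"/"fail" strings rather than booleans. A construction sketch using the defaults shown in the diff (the scoring call itself is omitted here; see the sample file referenced in the docstring):

    from azure.ai.evaluation import DocumentRetrievalEvaluator

    doc_retrieval_eval = DocumentRetrievalEvaluator(
        ground_truth_label_min=0,
        ground_truth_label_max=4,
        ndcg_threshold=0.5,
        xdcg_threshold=50.0,
        fidelity_threshold=0.5,
        top1_relevance_threshold=50.0,
        top3_max_relevance_threshold=50.0,
        total_retrieved_documents_threshold=50,
        total_ground_truth_documents_threshold=50,
    )
    # Each reported metric now carries "<metric>_result" ("pass"/"fail"),
    # "<metric>_threshold", and "<metric>_higher_is_better" fields.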
@@ -38,6 +38,16 @@ class F1ScoreEvaluator(EvaluatorBase):
             :dedent: 8
             :caption: Initialize and call an F1ScoreEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -44,6 +44,16 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize with threshold and call a FluencyEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START fluency_evaluator]
+            :end-before: [END fluency_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call FluencyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -43,6 +43,16 @@ class GleuScoreEvaluator(EvaluatorBase):
             :language: python
             :dedent: 8
             :caption: Initialize with threshold and call a GleuScoreEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START gleu_score_evaluator]
+            :end-before: [END gleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
 
     id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
@@ -53,6 +53,16 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize with threshold and call a GroundednessEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_evaluator]
+            :end-before: [END groundedness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.