azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (144)
  1. azure/ai/evaluation/__init__.py +10 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +7 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +165 -34
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  59. azure/ai/evaluation/_converters/_models.py +76 -6
  60. azure/ai/evaluation/_eval_mapping.py +73 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  63. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  64. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  65. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  66. azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
  67. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
  68. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  69. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  70. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  71. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  72. azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  76. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  77. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  78. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  79. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  80. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  81. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
  82. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  83. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
  84. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  85. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
  86. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
  87. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  88. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  89. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  90. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
  91. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
  92. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  93. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  94. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  95. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  96. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
  97. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
  98. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
  99. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  100. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  101. azure/ai/evaluation/_exceptions.py +2 -0
  102. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  103. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  104. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  106. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  107. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  109. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  110. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  111. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  112. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  113. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  114. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  115. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  116. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  117. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  118. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  119. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
  120. azure/ai/evaluation/_version.py +1 -1
  121. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  122. azure/ai/evaluation/red_team/_red_team.py +976 -546
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  125. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  126. azure/ai/evaluation/simulator/_constants.py +1 -0
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  128. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  129. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
  140. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
  141. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  142. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  143. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  144. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,7 @@ from pathlib import Path
10
10
  from typing import Any, Dict, NamedTuple, Optional, Union, cast
11
11
  import uuid
12
12
  import base64
13
+ import math
13
14
 
14
15
  import pandas as pd
15
16
  from azure.ai.evaluation._legacy._adapters.entities import Run
@@ -126,6 +127,82 @@ def process_message_content(content, images_folder_path):
126
127
  f.write(image_data_binary)
127
128
  return None
128
129
 
130
+ def _log_metrics_and_instance_results_onedp(
131
+ metrics: Dict[str, Any],
132
+ instance_results: pd.DataFrame,
133
+ project_url: str,
134
+ evaluation_name: Optional[str],
135
+ name_map: Dict[str, str],
136
+ **kwargs,
137
+ ) -> Optional[str]:
138
+
139
+ # One RP Client
140
+ from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
141
+ from azure.ai.evaluation._constants import TokenScope
142
+ from azure.ai.evaluation._common import EvaluationServiceOneDPClient, EvaluationUpload
143
+
144
+ credentials = AzureMLTokenManager(
145
+ TokenScope.COGNITIVE_SERVICES_MANAGEMENT.value, LOGGER, credential=kwargs.get("credential")
146
+ )
147
+ client = EvaluationServiceOneDPClient(
148
+ endpoint=project_url,
149
+ credential=credentials
150
+ )
151
+
152
+ # Massaging before artifacts are put on disk
153
+ # Add line_number as a column (copied from the index); the UI needs it to form a link to each individual instance run
154
+ instance_results["line_number"] = instance_results.index.values
155
+
156
+ artifact_name = "instance_results.jsonl"
157
+
158
+ with tempfile.TemporaryDirectory() as tmpdir:
159
+ # storing multi_modal images if exists
160
+ col_name = "inputs.conversation"
161
+ if col_name in instance_results.columns:
162
+ for item in instance_results[col_name].items():
163
+ value = item[1]
164
+ if "messages" in value:
165
+ _store_multimodal_content(value["messages"], tmpdir)
166
+
167
+ # storing artifact result
168
+ tmp_path = os.path.join(tmpdir, artifact_name)
169
+
170
+ with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
171
+ f.write(instance_results.to_json(orient="records", lines=True))
172
+
173
+ properties = {
174
+ EvaluationRunProperties.RUN_TYPE: "eval_run",
175
+ EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
176
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
177
+ "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
178
+ }
179
+ properties.update(_convert_name_map_into_property_entries(name_map))
180
+
181
+ create_evaluation_result_response = client.create_evaluation_result(
182
+ name=uuid.uuid4(),
183
+ path=tmpdir,
184
+ metrics=metrics
185
+ )
186
+
187
+ upload_run_response = client.start_evaluation_run(
188
+ evaluation=EvaluationUpload(
189
+ display_name=evaluation_name,
190
+ )
191
+ )
192
+
193
+ update_run_response = client.update_evaluation_run(
194
+ name=upload_run_response.id,
195
+ evaluation=EvaluationUpload(
196
+ display_name=evaluation_name,
197
+ status="Completed",
198
+ outputs={
199
+ 'evaluationResultId': create_evaluation_result_response.id,
200
+ },
201
+ properties=properties,
202
+ )
203
+ )
204
+
205
+ return update_run_response.properties.get("AiStudioEvaluationUri")
129
206
 
130
207
  def _log_metrics_and_instance_results(
131
208
  metrics: Dict[str, Any],
@@ -133,6 +210,7 @@ def _log_metrics_and_instance_results(
133
210
  trace_destination: Optional[str],
134
211
  run: Optional[Run],
135
212
  evaluation_name: Optional[str],
213
+ name_map: Dict[str, str],
136
214
  **kwargs,
137
215
  ) -> Optional[str]:
138
216
  from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -187,14 +265,14 @@ def _log_metrics_and_instance_results(
187
265
  # adding these properties to avoid showing traces if a dummy run is created.
188
266
  # We are doing that only for the pure evaluation runs.
189
267
  if run is None:
190
- ev_run.write_properties_to_run_history(
191
- properties={
268
+ properties = {
192
269
  EvaluationRunProperties.RUN_TYPE: "eval_run",
193
270
  EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
194
271
  EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
195
272
  "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
196
273
  }
197
- )
274
+ properties.update(_convert_name_map_into_property_entries(name_map))
275
+ ev_run.write_properties_to_run_history(properties=properties)
198
276
  else:
199
277
  ev_run.write_properties_to_run_history(
200
278
  properties={
@@ -241,7 +319,7 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
241
319
  p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
242
320
 
243
321
  with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
244
- json.dump(data_dict, f)
322
+ json.dump(data_dict, f, ensure_ascii=False)
245
323
 
246
324
  print(f'Evaluation results saved to "{p.resolve()}".\n')
247
325
 
@@ -329,6 +407,41 @@ def set_event_loop_policy() -> None:
329
407
  # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
330
408
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
331
409
 
410
+ # textwrap.wrap breaks at word boundaries and alters whitespace; we need exact fixed-width slices instead
411
+ def _wrap(s, w):
412
+ return [s[i:i + w] for i in range(0, len(s), w)]
413
+
414
+ def _convert_name_map_into_property_entries(
415
+ name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
416
+ ) -> Dict[str, Any]:
417
+ """
418
+ Convert the name map into property entries.
419
+
420
+ :param name_map: The name map to be converted.
421
+ :type name_map: Dict[str, str]
422
+ :param segment_length: The max length of each individual segment,
423
+ which will each have their own dictionary entry
424
+ :type segment_length: int
425
+ :param max_segments: The max number of segments we can have. If the stringified
426
+ name map is too long, we just return a length entry with a value
427
+ of -1 to indicate that the map was too long.
428
+ :type max_segments: int
429
+ :return: The converted name map.
430
+ :rtype: Dict[str, Any]
431
+ """
432
+ name_map_string = json.dumps(name_map)
433
+ num_segments = math.ceil(len(name_map_string) / segment_length)
434
+ # Property map is somehow still too long to encode within the space
435
+ # we allow, so give up, but make sure the service knows we gave up
436
+ if (num_segments > max_segments):
437
+ return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
438
+
439
+ result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
440
+ segments_list = _wrap(name_map_string, segment_length)
441
+ for i in range(0, num_segments):
442
+ segment_key = f"{EvaluationRunProperties.NAME_MAP}_{i}"
443
+ result[segment_key] = segments_list[i]
444
+ return result
332
445
 
333
446
  class JSONLDataFileLoader:
334
447
  def __init__(self, filename: Union[os.PathLike, str]):
@@ -33,7 +33,17 @@ class BleuScoreEvaluator(EvaluatorBase):
33
33
  :end-before: [END bleu_score_evaluator]
34
34
  :language: python
35
35
  :dedent: 8
36
- :caption: Initialize and call an BleuScoreEvaluator.
36
+ :caption: Initialize and call a BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
37
+
38
+ .. admonition:: Example using Azure AI Project URL:
39
+
40
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
41
+ :start-after: [START bleu_score_evaluator]
42
+ :end-before: [END bleu_score_evaluator]
43
+ :language: python
44
+ :dedent: 8
45
+ :caption: Initialize and call a BleuScoreEvaluator using Azure AI Project URL in the following format
46
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
37
47
 
38
48
  .. admonition:: Example with Threshold:
39
49
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -62,7 +62,15 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
62
62
  :end-before: [END code_vulnerability_evaluator]
63
63
  :language: python
64
64
  :dedent: 8
65
- :caption: Initialize and call a CodeVulnerabilityEvaluator with a query and response.
65
+ :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
66
+
67
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
68
+ :start-after: [START code_vulnerability_evaluator]
69
+ :end-before: [END code_vulnerability_evaluator]
70
+ :language: python
71
+ :dedent: 8
72
+ :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in the following format
73
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
66
74
 
67
75
  .. note::
68
76
 
@@ -31,7 +31,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
31
31
  :end-before: [END coherence_evaluator]
32
32
  :language: python
33
33
  :dedent: 8
34
- :caption: Initialize and call a CoherenceEvaluator with a query and response.
34
+ :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
35
+
36
+ .. admonition:: Example using Azure AI Project URL:
37
+
38
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
39
+ :start-after: [START coherence_evaluator]
40
+ :end-before: [END coherence_evaluator]
41
+ :language: python
42
+ :dedent: 8
43
+ :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in the following format
44
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
35
45
 
36
46
  .. admonition:: Example with Threshold:
37
47
 
@@ -40,7 +50,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
40
50
  :end-before: [END threshold_coherence_evaluator]
41
51
  :language: python
42
52
  :dedent: 8
43
- :caption: Initialize with threshold and and call a CoherenceEvaluator with a query and response.
53
+ :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
44
54
 
45
55
  .. note::
46
56
 
@@ -86,6 +86,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
86
86
  :type _higher_is_better: Optional[bool]
87
87
  """
88
88
 
89
+ _NOT_APPLICABLE_RESULT = "not applicable"
90
+ _PASS_RESULT = "pass"
91
+ _FAIL_RESULT = "fail"
92
+
89
93
  # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
90
94
 
91
95
  # Make sure to call super().__init__() in the child class's __init__ method.
@@ -495,7 +499,8 @@ class AsyncEvaluatorBase:
495
499
  # are known to throw at this, mash them into kwargs, and then pass them into the real call.
496
500
  async def __call__(
497
501
  self, *, query=None, response=None, context=None, conversation=None, ground_truth=None,
498
- tool_call=None, tool_definitions=None, messages=None, **kwargs
502
+ tool_calls=None, tool_definitions=None, messages=None, retrieval_ground_truth=None,
503
+ retrieved_documents=None,**kwargs
499
504
  ):
500
505
  if conversation is not None:
501
506
  kwargs["conversation"] = conversation
@@ -509,11 +514,15 @@ class AsyncEvaluatorBase:
509
514
  kwargs["context"] = context
510
515
  if ground_truth is not None:
511
516
  kwargs["ground_truth"] = ground_truth
512
- if tool_call is not None:
513
- kwargs["tool_call"] = tool_call
517
+ if tool_calls is not None:
518
+ kwargs["tool_calls"] = tool_calls
514
519
  if tool_definitions is not None:
515
520
  kwargs["tool_definitions"] = tool_definitions
516
521
  if messages is not None:
517
522
  kwargs["messages"] = messages
523
+ if retrieval_ground_truth is not None:
524
+ kwargs["retrieval_ground_truth"] = retrieval_ground_truth
525
+ if retrieved_documents is not None:
526
+ kwargs["retrieved_documents"] = retrieved_documents
518
527
 
519
528
  return await self._real_call(**kwargs)
@@ -4,9 +4,13 @@
4
4
 
5
5
  import math
6
6
  import re
7
+ import os
7
8
  from typing import Dict, TypeVar, Union
8
9
 
9
- from azure.ai.evaluation._legacy.prompty import AsyncPrompty
10
+ if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
11
+ from promptflow.core._flow import AsyncPrompty
12
+ else:
13
+ from azure.ai.evaluation._legacy.prompty import AsyncPrompty
10
14
  from typing_extensions import override
11
15
 
12
16
  from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
@@ -39,13 +43,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
39
43
  :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
40
44
  Useful since some evaluators of this format are response-only.
41
45
  :type ignore_queries: bool
46
+ :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
47
+ :type is_reasoning_model: bool
42
48
  """
43
49
 
44
50
  _LLM_CALL_TIMEOUT = 600
45
51
  _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
46
52
 
47
- def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False, threshold: int = 3, _higher_is_better: bool = False):
53
+ def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
54
+ threshold: int = 3, _higher_is_better: bool = False, **kwargs) -> None:
48
55
  self._result_key = result_key
56
+ self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
49
57
  self._prompty_file = prompty_file
50
58
  self._threshold = threshold
51
59
  self._higher_is_better = _higher_is_better
@@ -59,7 +67,8 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
59
67
  user_agent,
60
68
  )
61
69
 
62
- self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
70
+ self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config,
71
+ is_reasoning_model=self._is_reasoning_model)
63
72
 
64
73
  # __call__ not overridden here because child classes have such varied signatures that there's no point
65
74
  # defining a default here.
@@ -12,7 +12,7 @@ from azure.ai.evaluation._common.constants import (
12
12
  _InternalAnnotationTasks,
13
13
  )
14
14
  from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
15
- from azure.ai.evaluation._common.utils import validate_azure_ai_project
15
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
16
16
  from azure.ai.evaluation._exceptions import EvaluationException
17
17
  from azure.ai.evaluation._common.utils import validate_conversation
18
18
  from azure.ai.evaluation._constants import _AggregationType
@@ -50,7 +50,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
50
50
  def __init__(
51
51
  self,
52
52
  eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
53
- azure_ai_project: dict,
53
+ azure_ai_project: Union[dict, str],
54
54
  credential: TokenCredential,
55
55
  eval_last_turn: bool = False,
56
56
  conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
@@ -44,7 +44,17 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
44
44
  :end-before: [END content_safety_evaluator]
45
45
  :language: python
46
46
  :dedent: 8
47
- :caption: Initialize and call a ContentSafetyEvaluator.
47
+ :caption: Initialize and call ContentSafetyEvaluator using azure.ai.evaluation.AzureAIProject.
48
+
49
+ .. admonition:: Example using Azure AI Project URL:
50
+
51
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
52
+ :start-after: [START content_safety_evaluator]
53
+ :end-before: [END content_safety_evaluator]
54
+ :language: python
55
+ :dedent: 8
56
+ :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
57
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
48
58
 
49
59
  .. admonition:: Example with Threshold:
50
60
 
@@ -53,7 +63,7 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
53
63
  :end-before: [END threshold_content_safety_evaluator]
54
64
  :language: python
55
65
  :dedent: 8
56
- :caption: Initialize with threshold and call a ContentSafetyEvaluator.
66
+ :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
57
67
  """
58
68
 
59
69
  id = "content_safety"
@@ -58,16 +58,26 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
58
58
  :end-before: [END hate_unfairness_evaluator]
59
59
  :language: python
60
60
  :dedent: 8
61
- :caption: Initialize and call a HateUnfairnessEvaluator.
62
-
63
- .. admonition:: Example with Threshold:
61
+ :caption: Initialize and call HateUnfairnessEvaluator using azure.ai.evaluation.AzureAIProject.
62
+
63
+ .. admonition:: Example using Azure AI Project URL:
64
+
65
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
66
+ :start-after: [START hate_unfairness_evaluator]
67
+ :end-before: [END hate_unfairness_evaluator]
68
+ :language: python
69
+ :dedent: 8
70
+ :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
71
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
64
72
 
73
+ .. admonition:: Example with Threshold:
74
+
65
75
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
66
76
  :start-after: [START threshold_hate_unfairness_evaluator]
67
77
  :end-before: [END threshold_hate_unfairness_evaluator]
68
78
  :language: python
69
79
  :dedent: 8
70
- :caption: Initialize with threshold and call a HateUnfairnessEvaluator.
80
+ :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
71
81
  """
72
82
 
73
83
  id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
@@ -52,16 +52,17 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
52
52
  :end-before: [END self_harm_evaluator]
53
53
  :language: python
54
54
  :dedent: 8
55
- :caption: Initialize and call a SelfHarmEvaluator.
56
-
57
- .. admonition:: Example:
58
-
59
- .. literalinclude:: ../samples/evaluation_samples_threshold.py
60
- :start-after: [START threshold_self_harm_evaluator]
61
- :end-before: [END threshold_self_harm_evaluator]
55
+ :caption: Initialize and call SelfHarmEvaluator using azure.ai.evaluation.AzureAIProject.
56
+
57
+ .. admonition:: Example using Azure AI Project URL:
58
+
59
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
60
+ :start-after: [START self_harm_evaluator]
61
+ :end-before: [END self_harm_evaluator]
62
62
  :language: python
63
63
  :dedent: 8
64
- :caption: Initialize with threshold and call a SelfHarmEvaluator.
64
+ :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
65
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
65
66
  """
66
67
 
67
68
  id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
@@ -56,6 +56,16 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
56
56
  :dedent: 8
57
57
  :caption: Initialize and call a SexualEvaluator.
58
58
 
59
+ .. admonition:: Example using Azure AI Project URL:
60
+
61
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
62
+ :start-after: [START sexual_evaluator]
63
+ :end-before: [END sexual_evaluator]
64
+ :language: python
65
+ :dedent: 8
66
+ :caption: Initialize and call SexualEvaluator using Azure AI Project URL in the following format
67
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
68
+
59
69
  .. admonition:: Example with Threshold:
60
70
 
61
71
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -56,6 +56,16 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
56
56
  :dedent: 8
57
57
  :caption: Initialize and call a ViolenceEvaluator.
58
58
 
59
+ .. admonition:: Example using Azure AI Project URL:
60
+
61
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
62
+ :start-after: [START violence_evaluator]
63
+ :end-before: [END violence_evaluator]
64
+ :language: python
65
+ :dedent: 8
66
+ :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in the following format
67
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
68
+
59
69
  .. admonition:: Example:
60
70
 
61
71
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -0,0 +1,11 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+
5
+ from ._document_retrieval import DocumentRetrievalEvaluator, RetrievalGroundTruthDocument, RetrievedDocument
6
+
7
+ __all__ = [
8
+ "DocumentRetrievalEvaluator",
9
+ "RetrievalGroundTruthDocument",
10
+ "RetrievedDocument"
11
+ ]