azure-ai-evaluation 1.2.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic; the original page linked to more details.

Files changed (134)
  1. azure/ai/evaluation/__init__.py +42 -14
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +38 -4
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +30 -10
  28. azure/ai/evaluation/_constants.py +10 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  36. azure/ai/evaluation/_evaluate/_evaluate.py +36 -4
  37. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  38. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  39. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  40. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  41. azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
  42. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
  43. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
  44. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  45. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  46. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  47. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  48. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  49. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  50. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  51. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  52. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  53. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
  54. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  55. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  56. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  57. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  58. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  59. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  60. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
  62. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  63. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  64. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  65. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  66. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  67. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  68. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  69. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  70. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  72. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  73. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  74. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  75. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  76. azure/ai/evaluation/_exceptions.py +5 -1
  77. azure/ai/evaluation/_legacy/__init__.py +3 -0
  78. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  79. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  80. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  81. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  82. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  83. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  84. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  85. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  86. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  87. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  92. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  93. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  94. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  95. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  96. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  97. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  98. azure/ai/evaluation/_red_team/__init__.py +3 -0
  99. azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
  100. azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
  101. azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
  102. azure/ai/evaluation/_red_team/_default_converter.py +21 -0
  103. azure/ai/evaluation/_red_team/_red_team.py +1858 -0
  104. azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
  105. azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
  106. azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
  107. azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
  108. azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
  109. azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
  110. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  111. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  112. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +741 -0
  113. azure/ai/evaluation/_version.py +2 -1
  114. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  115. azure/ai/evaluation/simulator/_adversarial_simulator.py +61 -27
  116. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  117. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  118. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  119. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  120. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  121. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +75 -15
  122. azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
  123. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
  124. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  125. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  126. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  127. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  128. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  129. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  130. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  131. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  132. azure_ai_evaluation-1.2.0.dist-info/RECORD +0 -125
  133. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
  134. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,20 @@
1
+ # ------------------------------------
2
+ # Copyright (c) Microsoft Corporation.
3
+ # Licensed under the MIT License.
4
+ # ------------------------------------
5
+ """Customize generated code here.
6
+
7
+ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
8
+ """
9
+ from typing import List
10
+
11
+ __all__: List[str] = [] # Add all objects you want publicly available to users at this package level
12
+
13
+
14
+ def patch_sdk():
15
+ """Do not remove from this file.
16
+
17
+ `patch_sdk` is a last resort escape hatch that allows you to do customizations
18
+ you can't accomplish using the techniques described in
19
+ https://aka.ms/azsdk/python/dpcodegen/python/customize
20
+ """
@@ -0,0 +1 @@
1
+ # Marker file for PEP 561.
@@ -274,8 +274,26 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
274
274
 
275
275
  return cast(T_TypedDict, o)
276
276
 
277
+ def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool:
278
+ """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].
279
+
280
+ :param score: The score to check.
281
+ :type score: Union[str, float]
282
+ :param min_score: The minimum score. Default is 1.
283
+ :type min_score: int
284
+ :param max_score: The maximum score. Default is 5.
285
+ :type max_score: int
286
+ :return: True if the score is valid, False otherwise.
287
+ :rtype: bool
288
+ """
289
+ try:
290
+ numeric_score = float(score)
291
+ except (ValueError, TypeError):
292
+ return False
277
293
 
278
- def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
294
+ return min_score <= numeric_score <= max_score
295
+
296
+ def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
279
297
  """Parse the output of prompt-based quality evaluators that return a score and reason.
280
298
 
281
299
  Current supported evaluators:
@@ -284,6 +302,8 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
284
302
  - Retrieval
285
303
  - Groundedness
286
304
  - Coherence
305
+ - ResponseCompleteness
306
+ - TaskAdherence
287
307
 
288
308
  :param llm_output: The output of the prompt-based quality evaluator.
289
309
  :type llm_output: str
@@ -294,7 +314,7 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
294
314
  reason = ""
295
315
  if llm_output:
296
316
  try:
297
- score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
317
+ score_pattern = rf"<S2>\D*?({valid_score_range}).*?</S2>"
298
318
  reason_pattern = r"<S1>(.*?)</S1>"
299
319
  score_match = re.findall(score_pattern, llm_output, re.DOTALL)
300
320
  reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
@@ -366,7 +386,7 @@ def validate_conversation(conversation):
366
386
  if not isinstance(messages, list):
367
387
  raise_exception(
368
388
  "'messages' parameter must be a JSON-compatible list of chat messages",
369
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
389
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
370
390
  )
371
391
  expected_roles = {"user", "assistant", "system"}
372
392
  image_found = False
@@ -393,7 +413,7 @@ def validate_conversation(conversation):
393
413
  ):
394
414
  raise_exception(
395
415
  f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
396
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
416
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
397
417
  )
398
418
  if isinstance(message, AssistantMessage):
399
419
  assistant_message_count += 1
@@ -407,7 +427,7 @@ def validate_conversation(conversation):
407
427
  if message.get("role") not in expected_roles:
408
428
  raise_exception(
409
429
  f"Invalid role provided: {message.get('role')}. Message number: {num}",
410
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
430
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
411
431
  )
412
432
  if message.get("role") == "assistant":
413
433
  assistant_message_count += 1
@@ -417,7 +437,7 @@ def validate_conversation(conversation):
417
437
  if not isinstance(content, (str, list)):
418
438
  raise_exception(
419
439
  f"Content in each turn must be a string or array. Message number: {num}",
420
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
440
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
421
441
  )
422
442
  if isinstance(content, list):
423
443
  if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
@@ -425,21 +445,21 @@ def validate_conversation(conversation):
425
445
  if not image_found:
426
446
  raise_exception(
427
447
  "Message needs to have multi-modal input like images.",
428
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
448
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
429
449
  )
430
450
  if assistant_message_count == 0:
431
451
  raise_exception(
432
452
  "Assistant role required in one of the messages.",
433
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
453
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
434
454
  )
435
455
  if user_message_count == 0:
436
456
  raise_exception(
437
457
  "User role required in one of the messages.",
438
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
458
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
439
459
  )
440
460
  if assistant_message_count > 1:
441
461
  raise_exception(
442
462
  "Evaluators for multimodal conversations only support single turn. "
443
463
  "User and assistant role expected as the only role in each message.",
444
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
464
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
445
465
  )
@@ -22,7 +22,12 @@ class EvaluationMetrics:
22
22
  SELF_HARM = "self_harm"
23
23
  SEXUAL = "sexual"
24
24
  PROTECTED_MATERIAL = "protected_material"
25
+ ARTWORK = "artwork"
26
+ FICTIONAL_CHARACTERS = "fictional_characters"
27
+ LOGOS_AND_BRANDS = "logos_and_brands"
25
28
  XPIA = "xpia"
29
+ CODE_VULNERABILITY = "code_vulnerability"
30
+ UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
26
31
 
27
32
 
28
33
  class _InternalEvaluationMetrics:
@@ -89,3 +94,8 @@ OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
89
94
  AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"
90
95
 
91
96
  OPENAI_TYPE: Literal["openai"] = "openai"
97
+
98
+ EVALUATION_PASS_FAIL_MAPPING = {
99
+ True: "pass",
100
+ False: "fail",
101
+ }
@@ -0,0 +1,3 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------