azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
  3. azure/ai/evaluation/_common/constants.py +5 -0
  4. azure/ai/evaluation/_common/math.py +73 -2
  5. azure/ai/evaluation/_common/rai_service.py +250 -62
  6. azure/ai/evaluation/_common/utils.py +196 -23
  7. azure/ai/evaluation/_constants.py +7 -6
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
  10. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
  11. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
  13. azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
  15. azure/ai/evaluation/_evaluate/_utils.py +46 -11
  16. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
  17. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
  18. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  31. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
  32. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  34. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  35. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
  36. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  44. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
  45. azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
  46. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
  47. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  48. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
  49. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  50. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
  51. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  52. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  53. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
  54. azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
  55. azure/ai/evaluation/_exceptions.py +19 -0
  56. azure/ai/evaluation/_model_configurations.py +83 -15
  57. azure/ai/evaluation/_version.py +1 -1
  58. azure/ai/evaluation/simulator/__init__.py +2 -1
  59. azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
  60. azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
  61. azure/ai/evaluation/simulator/_constants.py +11 -1
  62. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  63. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  64. azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
  65. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  66. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  67. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
  68. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
  69. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
  70. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  71. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  72. azure/ai/evaluation/simulator/_simulator.py +165 -105
  73. azure/ai/evaluation/simulator/_utils.py +31 -13
  74. azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
  75. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
  76. azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
  77. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
  78. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  79. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  80. azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
  81. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  82. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  83. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/__init__.py
+++ b/azure/ai/evaluation/__init__.py
@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
     SexualEvaluator,
     ViolenceEvaluator,
 )
+from ._evaluators._multimodal._content_safety_multimodal import (
+    ContentSafetyMultimodalEvaluator,
+    HateUnfairnessMultimodalEvaluator,
+    SelfHarmMultimodalEvaluator,
+    SexualMultimodalEvaluator,
+    ViolenceMultimodalEvaluator,
+)
+from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
 from ._evaluators._groundedness import GroundednessEvaluator
+from ._evaluators._service_groundedness import GroundednessProEvaluator
 from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
@@ -27,7 +36,10 @@ from ._evaluators._xpia import IndirectAttackEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
+    Conversation,
+    EvaluationResult,
     EvaluatorConfig,
+    Message,
     OpenAIModelConfiguration,
 )
 
@@ -37,6 +49,7 @@ __all__ = [
     "F1ScoreEvaluator",
     "FluencyEvaluator",
     "GroundednessEvaluator",
+    "GroundednessProEvaluator",
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
@@ -57,4 +70,13 @@ __all__ = [
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
     "EvaluatorConfig",
+    "Conversation",
+    "Message",
+    "EvaluationResult",
+    "ContentSafetyMultimodalEvaluator",
+    "HateUnfairnessMultimodalEvaluator",
+    "SelfHarmMultimodalEvaluator",
+    "SexualMultimodalEvaluator",
+    "ViolenceMultimodalEvaluator",
+    "ProtectedMaterialMultimodalEvaluator",
 ]
--- a/azure/ai/evaluation/simulator/_helpers/_experimental.py
+++ b/azure/ai/evaluation/_common/_experimental.py
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import os
 import functools
 import inspect
 import logging
@@ -149,6 +150,9 @@ def _get_indentation_size(doc_string: str) -> int:
 def _should_skip_warning():
     skip_warning_msg = False
 
+    if os.getenv("AI_EVALS_DISABLE_EXPERIMENTAL_WARNING", "false").lower() == "true":
+        skip_warning_msg = True
+
     # Cases where we want to suppress the warning:
     # 1. When converting from REST object to SDK object
     for frame in inspect.stack():
--- a/azure/ai/evaluation/_common/constants.py
+++ b/azure/ai/evaluation/_common/constants.py
@@ -6,6 +6,9 @@ from enum import Enum
 from azure.core import CaseInsensitiveEnumMeta
 
 
+PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
+
+
 class CommonConstants:
     """Define common constants."""
 
@@ -35,6 +38,7 @@ class Tasks:
     CONTENT_HARM = "content harm"
     PROTECTED_MATERIAL = "protected material"
     XPIA = "xpia"
+    GROUNDEDNESS = "groundedness"
 
 
 class _InternalAnnotationTasks:
@@ -56,6 +60,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
     XPIA = "xpia"
+    GROUNDEDNESS = "generic_groundedness"
 
 
 class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
--- a/azure/ai/evaluation/_common/math.py
+++ b/azure/ai/evaluation/_common/math.py
@@ -3,16 +3,87 @@
 # ---------------------------------------------------------
 
 import math
-from typing import List
+from typing import List, Callable, Any
+
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 def list_sum(lst: List[float]) -> float:
+    """Given a list of floats, return the sum of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The sum of the values in the list.
+    :rtype: float
+    """
+
     return sum(lst)
 
 
 def list_mean(lst: List[float]) -> float:
+    """Given a list of floats, calculate the mean of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     return list_sum(lst) / len(lst)
 
 
 def list_mean_nan_safe(lst: List[float]) -> float:
-    return list_mean([l for l in lst if not math.isnan(l)])
+    """Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
+    msg = "All score values are NaN. The mean cannot be calculated."
+    if all(math.isnan(l) for l in lst):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+    return list_mean([l for l in lst if not is_none_or_nan(l)])
+
+
+def apply_transform_nan_safe(lst: List[float], transform_fn: Callable[[float], Any]) -> List[Any]:
+    """Given a list of floats, remove all nan values, then apply the inputted transform function
+    to the remaining values, and return the resulting list of outputted values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :param transform_fn: A function that produces something when applied to a float.
+    :type transform_fn: Callable[[float], Any]
+    :return: A list of the transformed values.
+    :rtype: List[Any]
+    """
+
+    msg = "All score values are NaN. The mean cannot be calculated."
+    if all(math.isnan(l) for l in lst):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+    return [transform_fn(l) for l in lst if not is_none_or_nan(l)]
+
+
+def is_none_or_nan(val: float) -> bool:
+    """math.isnan raises an error if None is inputted. This is a more robust wrapper.
+
+    :param val: The value to check.
+    :type val: float
+    :return: Whether the value is None or NaN.
+    :rtype: bool
+    """
+
+    return val is None or math.isnan(val)