azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation has been flagged as potentially problematic; see the package registry's advisory page for details.

Files changed (93):
  1. azure/ai/evaluation/__init__.py +23 -1
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
  3. azure/ai/evaluation/_common/constants.py +9 -2
  4. azure/ai/evaluation/_common/math.py +29 -0
  5. azure/ai/evaluation/_common/rai_service.py +222 -93
  6. azure/ai/evaluation/_common/utils.py +328 -19
  7. azure/ai/evaluation/_constants.py +16 -8
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
  10. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
  11. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  13. azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
  14. azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
  16. azure/ai/evaluation/_evaluate/_utils.py +77 -25
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
  19. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  20. azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
  21. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
  22. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
  24. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
  25. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
  26. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
  27. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
  28. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
  29. azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
  30. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  31. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
  32. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  33. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  34. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
  35. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  38. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  44. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  45. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  46. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
  47. azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
  48. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
  49. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  50. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
  51. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  52. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  53. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  55. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
  56. azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
  57. azure/ai/evaluation/_exceptions.py +26 -6
  58. azure/ai/evaluation/_http_utils.py +203 -132
  59. azure/ai/evaluation/_model_configurations.py +23 -6
  60. azure/ai/evaluation/_vendor/__init__.py +3 -0
  61. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  62. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  63. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  64. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  65. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  66. azure/ai/evaluation/_version.py +1 -1
  67. azure/ai/evaluation/simulator/__init__.py +2 -1
  68. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  69. azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
  70. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  71. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  72. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  73. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  74. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  75. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  76. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
  78. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  80. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  81. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  82. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
  83. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  84. azure/ai/evaluation/simulator/_simulator.py +222 -169
  85. azure/ai/evaluation/simulator/_tracing.py +4 -4
  86. azure/ai/evaluation/simulator/_utils.py +6 -6
  87. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
  88. azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
  89. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  90. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  91. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  92. azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
  93. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
12
12
  SexualEvaluator,
13
13
  ViolenceEvaluator,
14
14
  )
15
+ from ._evaluators._multimodal._content_safety_multimodal import (
16
+ ContentSafetyMultimodalEvaluator,
17
+ HateUnfairnessMultimodalEvaluator,
18
+ SelfHarmMultimodalEvaluator,
19
+ SexualMultimodalEvaluator,
20
+ ViolenceMultimodalEvaluator,
21
+ )
22
+ from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
15
23
  from ._evaluators._f1_score import F1ScoreEvaluator
16
24
  from ._evaluators._fluency import FluencyEvaluator
17
25
  from ._evaluators._gleu import GleuScoreEvaluator
18
26
  from ._evaluators._groundedness import GroundednessEvaluator
27
+ from ._evaluators._service_groundedness import GroundednessProEvaluator
19
28
  from ._evaluators._meteor import MeteorScoreEvaluator
20
29
  from ._evaluators._protected_material import ProtectedMaterialEvaluator
21
30
  from ._evaluators._qa import QAEvaluator
@@ -27,8 +36,11 @@ from ._evaluators._xpia import IndirectAttackEvaluator
27
36
  from ._model_configurations import (
28
37
  AzureAIProject,
29
38
  AzureOpenAIModelConfiguration,
30
- OpenAIModelConfiguration,
39
+ Conversation,
40
+ EvaluationResult,
31
41
  EvaluatorConfig,
42
+ Message,
43
+ OpenAIModelConfiguration,
32
44
  )
33
45
 
34
46
  __all__ = [
@@ -37,6 +49,7 @@ __all__ = [
37
49
  "F1ScoreEvaluator",
38
50
  "FluencyEvaluator",
39
51
  "GroundednessEvaluator",
52
+ "GroundednessProEvaluator",
40
53
  "RelevanceEvaluator",
41
54
  "SimilarityEvaluator",
42
55
  "QAEvaluator",
@@ -57,4 +70,13 @@ __all__ = [
57
70
  "AzureOpenAIModelConfiguration",
58
71
  "OpenAIModelConfiguration",
59
72
  "EvaluatorConfig",
73
+ "Conversation",
74
+ "Message",
75
+ "EvaluationResult",
76
+ "ContentSafetyMultimodalEvaluator",
77
+ "HateUnfairnessMultimodalEvaluator",
78
+ "SelfHarmMultimodalEvaluator",
79
+ "SexualMultimodalEvaluator",
80
+ "ViolenceMultimodalEvaluator",
81
+ "ProtectedMaterialMultimodalEvaluator",
60
82
  ]
@@ -6,9 +6,9 @@ import functools
6
6
  import inspect
7
7
  import logging
8
8
  import sys
9
- from typing import Callable, Type, TypeVar, Union
9
+ from typing import Callable, Type, TypeVar, Union, overload
10
10
 
11
- from typing_extensions import ParamSpec
11
+ from typing_extensions import ParamSpec, TypeGuard
12
12
 
13
13
  DOCSTRING_TEMPLATE = ".. note:: {0} {1}\n\n"
14
14
  DOCSTRING_DEFAULT_INDENTATION = 8
@@ -22,20 +22,31 @@ EXPERIMENTAL_LINK_MESSAGE = (
22
22
  _warning_cache = set()
23
23
  module_logger = logging.getLogger(__name__)
24
24
 
25
- TExperimental = TypeVar("TExperimental", bound=Union[Type, Callable])
26
25
  P = ParamSpec("P")
27
26
  T = TypeVar("T")
28
27
 
29
28
 
30
- def experimental(wrapped: TExperimental) -> TExperimental:
29
+ @overload
30
+ def experimental(wrapped: Type[T]) -> Type[T]: ...
31
+
32
+
33
+ @overload
34
+ def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ...
35
+
36
+
37
+ def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]:
31
38
  """Add experimental tag to a class or a method.
32
39
 
33
40
  :param wrapped: Either a Class or Function to mark as experimental
34
- :type wrapped: TExperimental
41
+ :type wrapped: Union[Type[T], Callable[P, T]]
35
42
  :return: The wrapped class or method
36
- :rtype: TExperimental
43
+ :rtype: Union[Type[T], Callable[P, T]]
37
44
  """
38
- if inspect.isclass(wrapped):
45
+
46
+ def is_class(t: Union[Type[T], Callable[P, T]]) -> TypeGuard[Type[T]]:
47
+ return isinstance(t, type)
48
+
49
+ if is_class(wrapped):
39
50
  return _add_class_docstring(wrapped)
40
51
  if inspect.isfunction(wrapped):
41
52
  return _add_method_docstring(wrapped)
@@ -74,11 +85,11 @@ def _add_class_docstring(cls: Type[T]) -> Type[T]:
74
85
  cls.__doc__ = _add_note_to_docstring(cls.__doc__, doc_string)
75
86
  else:
76
87
  cls.__doc__ = doc_string + ">"
77
- cls.__init__ = _add_class_warning(cls.__init__)
88
+ cls.__init__ = _add_class_warning(cls.__init__) # type: ignore[method-assign]
78
89
  return cls
79
90
 
80
91
 
81
- def _add_method_docstring(func: Callable[P, T] = None) -> Callable[P, T]:
92
+ def _add_method_docstring(func: Callable[P, T]) -> Callable[P, T]:
82
93
  """Add experimental tag to the method doc string.
83
94
 
84
95
  :param func: The function to update
@@ -3,6 +3,11 @@
3
3
  # ---------------------------------------------------------
4
4
  from enum import Enum
5
5
 
6
+ from azure.core import CaseInsensitiveEnumMeta
7
+
8
+
9
+ PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
10
+
6
11
 
7
12
  class CommonConstants:
8
13
  """Define common constants."""
@@ -33,6 +38,7 @@ class Tasks:
33
38
  CONTENT_HARM = "content harm"
34
39
  PROTECTED_MATERIAL = "protected material"
35
40
  XPIA = "xpia"
41
+ GROUNDEDNESS = "groundedness"
36
42
 
37
43
 
38
44
  class _InternalAnnotationTasks:
@@ -43,7 +49,7 @@ class _InternalAnnotationTasks:
43
49
  ECI = "eci"
44
50
 
45
51
 
46
- class EvaluationMetrics:
52
+ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
47
53
  """Evaluation metrics to aid the RAI service in determining what
48
54
  metrics to request, and how to present them back to the user."""
49
55
 
@@ -54,9 +60,10 @@ class EvaluationMetrics:
54
60
  SEXUAL = "sexual"
55
61
  PROTECTED_MATERIAL = "protected_material"
56
62
  XPIA = "xpia"
63
+ GROUNDEDNESS = "generic_groundedness"
57
64
 
58
65
 
59
- class _InternalEvaluationMetrics:
66
+ class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
60
67
  """Evaluation metrics that are not publicly supported.
61
68
  These metrics are experimental and subject to potential change or migration to the main
62
69
  enum over time.
@@ -0,0 +1,29 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+
5
+ import math
6
+ from typing import List
7
+
8
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
9
+
10
+
11
+ def list_sum(lst: List[float]) -> float:
12
+ return sum(lst)
13
+
14
+
15
+ def list_mean(lst: List[float]) -> float:
16
+ return list_sum(lst) / len(lst)
17
+
18
+
19
+ def list_mean_nan_safe(lst: List[float]) -> float:
20
+ msg = "All score values are NaN. The mean cannot be calculated."
21
+ if all(math.isnan(l) for l in lst):
22
+ raise EvaluationException(
23
+ message=msg,
24
+ internal_message=msg,
25
+ blame=ErrorBlame.USER_ERROR,
26
+ category=ErrorCategory.INVALID_VALUE,
27
+ target=ErrorTarget.CONVERSATION,
28
+ )
29
+ return list_mean([l for l in lst if not math.isnan(l)])