azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (108)
  1. azure/ai/evaluation/__init__.py +4 -26
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +122 -302
  4. azure/ai/evaluation/_common/utils.py +35 -393
  5. azure/ai/evaluation/_constants.py +6 -28
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  10. azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  11. azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  13. azure/ai/evaluation/_evaluate/_utils.py +47 -108
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +134 -205
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +2 -3
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  60. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  64. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  65. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  66. azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  69. azure/ai/evaluation/simulator/_tracing.py +28 -25
  70. azure/ai/evaluation/simulator/_utils.py +13 -34
  71. azure/ai/evaluation/simulator/simulator.py +579 -0
  72. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  73. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  74. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_common/_experimental.py +0 -172
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  78. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  79. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  80. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  81. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  82. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  84. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  85. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  86. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  87. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  88. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  89. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  91. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  92. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  93. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  94. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  95. azure/ai/evaluation/_vendor/__init__.py +0 -3
  96. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  97. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  98. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  100. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  101. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  102. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  103. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  104. azure/ai/evaluation/simulator/_simulator.py +0 -716
  105. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  106. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  107. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  108. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
@@ -4,37 +4,35 @@
4
4
  # pylint: disable=C0103,C0114,C0116,E0401,E0611
5
5
 
6
6
  import functools
7
- from typing import Callable, TypeVar
8
7
 
9
8
  from promptflow._sdk._telemetry.activity import ActivityType, monitor_operation
10
- from typing_extensions import ParamSpec
11
-
12
- P = ParamSpec("P")
13
- R = TypeVar("R")
14
9
 
15
10
 
16
11
  def monitor_adversarial_scenario(activity_name: str = "adversarial.simulator.call"):
17
12
  """
18
13
  Monitor an adversarial scenario.
19
14
 
20
- :param activity_name: The name of the activity to monitor.
21
- :type activity_name: str
22
- :returns: A decorator
23
- :rtype: Callable[[Callable], Callable]
15
+ Parameters:
16
+ activity_name (str): The name of the activity to monitor.
24
17
  """
25
18
 
26
- def decorator(func: Callable[P, R]) -> Callable[P, R]:
19
+ def decorator(func):
27
20
  """
28
21
  Decorator for monitoring an adversarial scenario.
29
22
 
30
- :param func: The function to be decorated.
31
- :type func: Callable[P, R]
32
- :returns: The decorated function
33
- :rtype: Callable[P, R]
23
+ Parameters:
24
+ func (function): The function to be decorated.
34
25
  """
35
26
 
36
27
  @functools.wraps(func)
37
- def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
28
+ def wrapper(*args, **kwargs):
29
+ """
30
+ Wrapper for monitoring an adversarial scenario.
31
+
32
+ Parameters:
33
+ *args: Variable length argument list.
34
+ **kwargs: Arbitrary keyword arguments.
35
+ """
38
36
  scenario = str(kwargs.get("scenario", None))
39
37
  max_conversation_turns = kwargs.get("max_conversation_turns", None)
40
38
  max_simulation_results = kwargs.get("max_simulation_results", None)
@@ -57,28 +55,33 @@ def monitor_adversarial_scenario(activity_name: str = "adversarial.simulator.cal
57
55
  return decorator
58
56
 
59
57
 
60
- def monitor_task_simulator(func: Callable[P, R]) -> Callable[P, R]:
58
+ def monitor_task_simulator(func):
61
59
  """
62
60
  Monitor a task simulator.
63
61
 
64
- :param func: The function to be decorated.
65
- :type func: Callable[P, R]
66
- :returns: The decorated function
67
- :rtype: Callable[P, R]
62
+ Parameters:
63
+ func (function): The function to be decorated.
68
64
  """
69
65
 
70
66
  @functools.wraps(func)
71
- def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
72
- text = kwargs.get("text")
73
- user_persona = kwargs.get("user_persona")
67
+ def wrapper(*args, **kwargs):
68
+ """
69
+ Wrapper for monitoring a task simulator.
70
+
71
+ Parameters:
72
+ *args: Variable length argument list.
73
+ **kwargs: Arbitrary keyword arguments.
74
+ """
75
+ text_length = len(kwargs.get("text", ""))
76
+ user_persona_length = len(kwargs.get("user_persona", []))
74
77
  num_queries = kwargs.get("num_queries", 0)
75
78
  max_conversation_turns = kwargs.get("max_conversation_turns", 0)
76
79
  decorated_func = monitor_operation(
77
80
  activity_name="task.simulator.call",
78
81
  activity_type=ActivityType.PUBLICAPI,
79
82
  custom_dimensions={
80
- "text_length": len(text) if isinstance(text, str) else 0,
81
- "user_persona_length": len(user_persona) if isinstance(user_persona, list) else 0,
83
+ "text_length": text_length,
84
+ "user_persona_length": user_persona_length,
82
85
  "number_of_queries": num_queries,
83
86
  "max_conversation_turns": max_conversation_turns,
84
87
  },
@@ -26,9 +26,9 @@ class JsonLineList(list):
26
26
  json_lines += json.dumps(item) + "\n"
27
27
  return json_lines
28
28
 
29
- def to_eval_qr_json_lines(self):
29
+ def to_eval_qa_json_lines(self):
30
30
  """
31
- Converts the list to a string of JSON lines suitable for evaluation in a query & response format.
31
+ Converts the list to a string of JSON lines suitable for evaluation in a Q&A format.
32
32
  Each item in the list is expected to be a dictionary with
33
33
  'messages' key. The 'messages' value is a list of
34
34
  dictionaries, each with a 'role' key and a 'content' key.
@@ -44,41 +44,23 @@ class JsonLineList(list):
44
44
  for item in self:
45
45
  user_message = None
46
46
  assistant_message = None
47
- user_context = None
48
- assistant_context = None
49
- template_parameters = item.get("template_parameters", {})
50
- category = template_parameters.get("category", None)
47
+ context = None
51
48
  for message in item["messages"]:
52
49
  if message["role"] == "user":
53
50
  user_message = message["content"]
54
- user_context = message.get("context", "")
55
51
  elif message["role"] == "assistant":
56
52
  assistant_message = message["content"]
57
- assistant_context = message.get("context", "")
53
+ if "context" in message:
54
+ context = message.get("context", None)
58
55
  if user_message and assistant_message:
59
- if user_context or assistant_context:
56
+ if context:
60
57
  json_lines += (
61
- json.dumps(
62
- {
63
- "query": user_message,
64
- "response": assistant_message,
65
- "context": str(
66
- {
67
- "user_context": user_context,
68
- "assistant_context": assistant_context,
69
- }
70
- ),
71
- "category": category,
72
- }
73
- )
58
+ json.dumps({"query": user_message, "response": assistant_message, "context": context})
74
59
  + "\n"
75
60
  )
76
- user_message = assistant_message = None
61
+ user_message = assistant_message = context = None
77
62
  else:
78
- json_lines += (
79
- json.dumps({"query": user_message, "response": assistant_message, "category": category})
80
- + "\n"
81
- )
63
+ json_lines += json.dumps({"query": user_message, "response": assistant_message}) + "\n"
82
64
  user_message = assistant_message = None
83
65
 
84
66
  return json_lines
@@ -98,13 +80,10 @@ class JsonLineChatProtocol(dict):
98
80
  """
99
81
  return json.dumps(self)
100
82
 
101
- def to_eval_qr_json_lines(self) -> str:
83
+ def to_eval_qa_json_lines(self):
102
84
  """
103
- Converts the object to a string of JSON lines suitable for evaluation in a query and response format.
85
+ Converts the object to a string of JSON lines suitable for evaluation in a Q&A format.
104
86
  The object is expected to be a dictionary with 'messages' key.
105
-
106
- :returns: A json lines document
107
- :rtype: str
108
87
  """
109
88
  user_message = None
110
89
  assistant_message = None
@@ -123,10 +102,10 @@ class JsonLineChatProtocol(dict):
123
102
  if user_message and assistant_message:
124
103
  if context:
125
104
  json_lines += (
126
- json.dumps({"query": user_message, "response": assistant_message, "context": context}) + "\n"
105
+ json.dumps({"question": user_message, "answer": assistant_message, "context": context}) + "\n"
127
106
  )
128
107
  user_message = assistant_message = None
129
108
  else:
130
- json_lines += json.dumps({"query": user_message, "response": assistant_message}) + "\n"
109
+ json_lines += json.dumps({"question": user_message, "answer": assistant_message}) + "\n"
131
110
  user_message = assistant_message = None
132
111
  return json_lines