azure-ai-evaluation 1.2.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (134)
  1. azure/ai/evaluation/__init__.py +42 -14
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +38 -4
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +30 -10
  28. azure/ai/evaluation/_constants.py +10 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  36. azure/ai/evaluation/_evaluate/_evaluate.py +36 -4
  37. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  38. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  39. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  40. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  41. azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
  42. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
  43. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
  44. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  45. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  46. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  47. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  48. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  49. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  50. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  51. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  52. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  53. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
  54. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  55. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  56. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  57. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  58. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  59. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  60. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
  62. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  63. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  64. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  65. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  66. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  67. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  68. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  69. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  70. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  72. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  73. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  74. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  75. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  76. azure/ai/evaluation/_exceptions.py +5 -1
  77. azure/ai/evaluation/_legacy/__init__.py +3 -0
  78. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  79. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  80. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  81. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  82. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  83. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  84. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  85. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  86. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  87. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  92. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  93. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  94. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  95. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  96. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  97. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  98. azure/ai/evaluation/_red_team/__init__.py +3 -0
  99. azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
  100. azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
  101. azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
  102. azure/ai/evaluation/_red_team/_default_converter.py +21 -0
  103. azure/ai/evaluation/_red_team/_red_team.py +1858 -0
  104. azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
  105. azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
  106. azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
  107. azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
  108. azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
  109. azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
  110. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  111. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  112. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +741 -0
  113. azure/ai/evaluation/_version.py +2 -1
  114. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  115. azure/ai/evaluation/simulator/_adversarial_simulator.py +61 -27
  116. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  117. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  118. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  119. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  120. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  121. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +75 -15
  122. azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
  123. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
  124. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  125. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  126. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  127. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  128. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  129. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  130. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  131. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  132. azure_ai_evaluation-1.2.0.dist-info/RECORD +0 -125
  133. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
  134. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_converters/_models.py
@@ -0,0 +1,302 @@
+ import datetime
+ import json
+
+ from pydantic import BaseModel
+
+ from azure.ai.projects.models import RunStepFunctionToolCall
+
+ from typing import List, Optional, Union
+
+ # Message roles constants.
+ _SYSTEM = "system"
+ _USER = "user"
+ _AGENT = "assistant"
+ _TOOL = "tool"
+
+ # Constant definitions for what tool details include.
+ _TOOL_CALL = "tool_call"
+ _TOOL_RESULT = "tool_result"
+ _FUNCTION = "function"
+
+ # This is returned by AI services in the API to filter against tool invocations.
+ _TOOL_CALLS = "tool_calls"
+
+
+ class Message(BaseModel):
+     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
+     to JSON for evaluators and we have custom fields such as createdAt, run_id, and tool_call_id, so we cannot use
+     the standard pydantic models provided by OpenAI.
+
+     :param createdAt: The timestamp when the message was created.
+     :type createdAt: datetime.datetime
+     :param run_id: The ID of the run associated with the message. Optional.
+     :type run_id: Optional[str]
+     :param role: The role of the message sender (e.g., system, user, tool, assistant).
+     :type role: str
+     :param content: The content of the message, which can be a string or a list of dictionaries.
+     :type content: Union[str, List[dict]]
+     """
+
+     createdAt: Optional[Union[datetime.datetime, int]] = None  # SystemMessage wouldn't have this
+     run_id: Optional[str] = None
+     tool_call_id: Optional[str] = None  # see ToolMessage
+     role: str
+     content: Union[str, List[dict]]
+
+
+ class SystemMessage(Message):
+     """Represents a system message in a conversation with agents, assistants, and tools.
+
+     :param role: The role of the message sender, which is always 'system'.
+     :type role: str
+     """
+
+     role: str = _SYSTEM
+
+
+ class UserMessage(Message):
+     """Represents a user message in a conversation with agents, assistants, and tools.
+
+     :param role: The role of the message sender, which is always 'user'.
+     :type role: str
+     """
+
+     role: str = _USER
+
+
+ class ToolMessage(Message):
+     """Represents a tool message in a conversation with agents, assistants, and tools.
+
+     :param run_id: The ID of the run associated with the message.
+     :type run_id: str
+     :param role: The role of the message sender, which is always 'tool'.
+     :type role: str
+     :param tool_call_id: The ID of the tool call associated with the message. Optional.
+     :type tool_call_id: Optional[str]
+     """
+
+     run_id: str
+     role: str = _TOOL
+     tool_call_id: Optional[str] = None
+
+
+ class AssistantMessage(Message):
+     """Represents an assistant message.
+
+     :param run_id: The ID of the run associated with the message.
+     :type run_id: str
+     :param role: The role of the message sender, which is always 'assistant'.
+     :type role: str
+     """
+
+     run_id: str
+     role: str = _AGENT
+
+
+ class ToolDefinition(BaseModel):
+     """Represents a tool definition that will be used in the agent.
+
+     :param name: The name of the tool.
+     :type name: str
+     :param description: A description of the tool.
+     :type description: str
+     :param parameters: The parameters required by the tool.
+     :type parameters: dict
+     """
+
+     name: str
+     description: Optional[str] = None
+     parameters: dict
+
+
+ class ToolCall:
+     """Represents a tool call, used as an intermediate step in the conversion process.
+
+     :param created: The timestamp when the tool call was created.
+     :type created: datetime.datetime
+     :param completed: The timestamp when the tool call was completed.
+     :type completed: datetime.datetime
+     :param details: The details of the tool call.
+     :type details: RunStepFunctionToolCall
+     """
+
+     def __init__(self, created: datetime.datetime, completed: datetime.datetime, details: RunStepFunctionToolCall):
+         self.created = created
+         self.completed = completed
+         self.details = details
+
+
+ class EvaluatorData(BaseModel):
+     """Represents the result of a conversion.
+
+     :param query: A list of messages representing the system message, chat history, and user query.
+     :type query: List[Message]
+     :param response: A list of messages representing the assistant's response, including tool calls and results.
+     :type response: List[Message]
+     :param tool_definitions: A list of tool definitions used in the agent.
+     :type tool_definitions: List[ToolDefinition]
+     """
+
+     query: List[Message]
+     response: List[Message]
+     tool_definitions: List[ToolDefinition]
+
+     def to_json(self):
+         """Converts the result to a JSON string.
+
+         :return: The JSON representation of the result.
+         :rtype: str
+         """
+         return self.model_dump_json(exclude={}, exclude_none=True)
+
+
+ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
+     """
+     Breaks a tool call into a list of messages, including the tool call and its result.
+
+     :param tool_call: The tool call to be broken into messages.
+     :type tool_call: ToolCall
+     :param run_id: The ID of the run associated with the messages.
+     :type run_id: str
+     :return: A list of messages representing the tool call and its result.
+     :rtype: List[Message]
+     """
+     # We will use this as our accumulator.
+     messages: List[Message] = []
+
+     # As of March 17th, 2025, we only support custom functions due to built-in code interpreters and bing grounding
+     # tooling not reporting their function calls in the same way. Code interpreters don't include the tool call at
+     # all in most of the cases, and bing would only show the API URL, without arguments or results.
+     # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
+     # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
+     if hasattr(tool_call.details, _FUNCTION):
+         # This is the internals of the content object that will be included with the tool call.
+         tool_call_id = tool_call.details.id
+         content_tool_call = {
+             "type": _TOOL_CALL,
+             "tool_call_id": tool_call_id,
+             "name": tool_call.details.function.name,
+             "arguments": safe_loads(tool_call.details.function.arguments),
+         }
+     else:
+         # Treat built-in tools separately. Object models may be unique so handle each case separately
+         # Just converting to dicts here rather than custom serializers for simplicity for now.
+         # Don't fail if we run into a newly seen tool, just skip
+         if tool_call.details["type"] == "code_interpreter":
+             arguments = {"input": tool_call.details.code_interpreter.input}
+         elif tool_call.details["type"] == "bing_grounding":
+             arguments = {"requesturl": tool_call.details["bing_grounding"]["requesturl"]}
+         elif tool_call.details["type"] == "file_search":
+             options = tool_call.details["file_search"]["ranking_options"]
+             arguments = {
+                 "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
+             }
+         else:
+             # unsupported tool type, skip
+             return messages
+         try:
+             tool_call_id = tool_call.details.id
+             content_tool_call = {
+                 "type": _TOOL_CALL,
+                 "tool_call_id": tool_call_id,
+                 "name": tool_call.details.type,
+                 "arguments": arguments,
+             }
+         except:
+             return messages
+
+     # We format it into an assistant message, where the content is a singleton list of the content object.
+     # It should be a tool message, since this is the call, but the given schema treats this message as
+     # assistant's action of calling the tool.
+     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
+
+     if hasattr(tool_call.details, _FUNCTION):
+         output = safe_loads(tool_call.details.function.output)
+     else:
+         try:
+             # Some built-ins may have output, others may not
+             # Try to retrieve it, but if we don't find anything, skip adding the message
+             # Just manually converting to dicts for easy serialization for now rather than custom serializers
+             if tool_call.details.type == "code_interpreter":
+                 output = tool_call.details.code_interpreter.outputs
+             elif tool_call.details.type == "bing_grounding":
+                 return messages  # not supported yet from bing grounding tool
+             elif tool_call.details.type == "file_search":
+                 output = [
+                     {
+                         "file_id": result.file_id,
+                         "file_name": result.file_name,
+                         "score": result.score,
+                         "content": result.content,
+                     }
+                     for result in tool_call.details.file_search.results
+                 ]
+         except:
+             return messages
+
+     # Now, onto the tool result, which only includes the result of the function call.
+     content_tool_call_result = {"type": _TOOL_RESULT, _TOOL_RESULT: output}
+
+     # Since this is a tool's action of returning, we put it as a tool message.
+     messages.append(
+         ToolMessage(
+             run_id=run_id,
+             tool_call_id=tool_call_id,
+             content=[to_dict(content_tool_call_result)],
+             createdAt=tool_call.completed,
+         )
+     )
+     return messages
+
+
+ def to_dict(obj) -> dict:
+     """
+     Converts an object to a dictionary.
+
+     :param obj: The object to be converted.
+     :type obj: Any
+     :return: The dictionary representation of the object.
+     :rtype: dict
+     """
+     return json.loads(json.dumps(obj))
+
+
+ def safe_loads(data: str) -> Union[dict, str]:
+     """
+     Safely loads a JSON string into a Python dictionary or returns the original string if loading fails.
+     :param data: The JSON string to be loaded.
+     :type data: str
+     :return: The loaded dictionary or the original string.
+     :rtype: Union[dict, str]
+     """
+     try:
+         return json.loads(data)
+     except json.JSONDecodeError:
+         return data
+
+
+ def convert_message(msg: dict) -> Message:
+     """
+     Converts a dictionary to the appropriate Message subclass.
+
+     :param msg: The message dictionary.
+     :type msg: dict
+     :return: The Message object.
+     :rtype: Message
+     """
+     role = msg["role"]
+     if role == "system":
+         return SystemMessage(content=str(msg["content"]))
+     elif role == "user":
+         return UserMessage(content=msg["content"], createdAt=msg["createdAt"])
+     elif role == "assistant":
+         return AssistantMessage(run_id=str(msg["run_id"]), content=msg["content"], createdAt=msg["createdAt"])
+     elif role == "tool":
+         return ToolMessage(
+             run_id=str(msg["run_id"]),
+             tool_call_id=str(msg["tool_call_id"]),
+             content=msg["content"],
+             createdAt=msg["createdAt"],
+         )
+     else:
+         raise ValueError(f"Unknown role: {role}")
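
To make the intended flow of these converter models concrete, here is a minimal usage sketch (the conversation dicts are invented for illustration, and the import path assumes this private module layout):

    import datetime
    from azure.ai.evaluation._converters._models import EvaluatorData, convert_message

    now = int(datetime.datetime.now().timestamp())

    # Raw message dicts in the shape convert_message() expects; "role" picks the subclass.
    query = [
        convert_message({"role": "system", "content": "You are a helpful agent."}),
        convert_message({"role": "user", "content": "What is 2 + 2?", "createdAt": now}),
    ]
    response = [
        convert_message({"role": "assistant", "run_id": "run_1", "content": "2 + 2 equals 4.", "createdAt": now}),
    ]

    # EvaluatorData bundles query, response, and tool definitions for evaluators.
    print(EvaluatorData(query=query, response=response, tool_definitions=[]).to_json())
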
azure/ai/evaluation/_evaluate/_batch_run/__init__.py
@@ -3,8 +3,15 @@
  # ---------------------------------------------------------
  from .eval_run_context import EvalRunContext
  from .code_client import CodeClient
- from .proxy_client import ProxyClient
+ from .proxy_client import ProxyClient, ProxyRun
+ from ._run_submitter_client import RunSubmitterClient
  from .target_run_context import TargetRunContext
- from .proxy_client import ProxyRun

- __all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]
+ __all__ = [
+     "CodeClient",
+     "ProxyClient",
+     "EvalRunContext",
+     "TargetRunContext",
+     "ProxyRun",
+     "RunSubmitterClient",
+ ]
azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py
@@ -0,0 +1,104 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import logging
+ import pandas as pd
+ import sys
+ from collections import defaultdict
+ from concurrent.futures import Future, ThreadPoolExecutor
+ from os import PathLike
+ from typing import Any, Callable, Dict, Final, List, Mapping, Optional, Sequence, Union, cast
+
+ from .batch_clients import BatchClientRun, HasAsyncCallable
+ from ..._legacy._batch_engine._run_submitter import RunSubmitter
+ from ..._legacy._batch_engine._config import BatchEngineConfig
+ from ..._legacy._batch_engine._run import Run
+
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class RunSubmitterClient:
+     def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
+         self._config = config or BatchEngineConfig(LOGGER, use_async=True)
+         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+     def run(
+         self,
+         flow: Callable,
+         data: Union[str, PathLike, pd.DataFrame],
+         column_mapping: Optional[Dict[str, str]] = None,
+         evaluator_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> BatchClientRun:
+         if not isinstance(data, pd.DataFrame):
+             # Should never get here
+             raise ValueError("Data must be a pandas DataFrame")
+         if not column_mapping:
+             raise ValueError("Column mapping must be provided")
+
+         # The column mappings are index by data to indicate they come from the data
+         # input. Update the inputs so that each entry is a dictionary with a data key
+         # that contains the original input data.
+         inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]
+
+         # always uses async behind the scenes
+         if isinstance(flow, HasAsyncCallable):
+             flow = flow._to_async()  # pylint: disable=protected-access
+
+         run_submitter = RunSubmitter(self._config)
+         run_future = self._thread_pool.submit(
+             run_submitter.submit,
+             dynamic_callable=flow,
+             inputs=inputs,
+             column_mapping=column_mapping,
+             name_prefix=evaluator_name,
+             created_on=kwargs.pop("created_on", None),
+             storage_creator=kwargs.pop("storage_creator", None),
+             **kwargs,
+         )
+
+         return run_future
+
+     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+         run = self._get_run(client_run)
+
+         data: Dict[str, List[Any]] = defaultdict(list)
+         stop_at: Final[int] = self._config.default_num_results if not all_results else sys.maxsize
+
+         def _update(prefix: str, items: Sequence[Mapping[str, Any]]) -> None:
+             for i, line in enumerate(items):
+                 if i >= stop_at:
+                     break
+                 for k, value in line.items():
+                     key = f"{prefix}.{k}"
+                     data[key].append(value)
+
+         _update("inputs", run.inputs)
+         _update("outputs", run.outputs)
+
+         df = pd.DataFrame(data).reindex(columns=[k for k in data.keys()])
+         return df
+
+     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         run = self._get_run(client_run)
+         return dict(run.metrics)
+
+     def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         run = self._get_run(client_run)
+
+         total_lines = run.result.total_lines if run.result else 0
+         failed_lines = run.result.failed_lines if run.result else 0
+
+         return {
+             "status": run.status.value,
+             "duration": str(run.duration),
+             "completed_lines": total_lines - failed_lines,
+             "failed_lines": failed_lines,
+             # "log_path": "",
+         }
+
+     @staticmethod
+     def _get_run(run: BatchClientRun) -> Run:
+         return cast(Future[Run], run).result()
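
For orientation, a minimal usage sketch of the client above (the evaluator function and column names are invented; note that run() returns a concurrent.futures.Future wrapping the Run, which the getter methods resolve via _get_run()):

    import pandas as pd
    from azure.ai.evaluation._evaluate._batch_run import RunSubmitterClient

    def length_evaluator(answer: str) -> dict:
        # Hypothetical per-row evaluator: scores each answer by its length.
        return {"length": len(answer)}

    client = RunSubmitterClient()
    data = pd.DataFrame({"answer": ["hello", "hi there"]})
    run = client.run(
        flow=length_evaluator,
        data=data,
        column_mapping={"answer": "${data.answer}"},
        evaluator_name="length",
    )
    print(client.get_run_summary(run))  # status, duration, completed/failed line counts
    print(client.get_details(run))      # inputs.* / outputs.* columns as a DataFrame
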
azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py
@@ -0,0 +1,82 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import pandas
+ from os import PathLike
+ from typing import Any, Awaitable, Callable, Dict, Optional, Protocol, Union, runtime_checkable
+
+
+ class BatchClientRun(Protocol):
+     """The protocol for the batch client run."""
+
+     pass
+
+
+ @runtime_checkable
+ class HasAsyncCallable(Protocol):
+     """The protocol for an object that has an async callable."""
+
+     def _to_async(self) -> Callable[[Any, Any], Awaitable[Any]]: ...
+
+
+ class BatchClient(Protocol):
+     """The protocol for the batch client. This allows for running a flow on a data source
+     and getting the details of the run."""
+
+     def run(
+         self,
+         flow: Callable,
+         data: Union[str, PathLike, pandas.DataFrame],
+         column_mapping: Optional[Dict[str, str]] = None,
+         evaluator_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> BatchClientRun:
+         """Run the given flow on the data with the given column mapping.
+
+         :param flow: The flow to run.
+         :type flow: Union[Callable, HasAsyncCallable]
+         :param data: The JSONL file containing the data to run the flow on,
+             or the loaded data
+         :type data: Union[str, PathLike]
+         :param column_mapping: The column mapping to use.
+         :type column_mapping: Mapping[str, str]
+         :param name: The name of the run.
+         :type name: Optional[str]
+         :param kwargs: Additional keyword arguments to pass to the flow.
+         :return: The result of the batch client run.
+         :rtype: BatchClientRun
+         """
+         ...
+
+     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pandas.DataFrame:
+         """Get the details of the run.
+
+         :param client_run: The run to get the details of.
+         :type client_run: BatchClientRun
+         :param all_results: Whether to get all results.
+         :type all_results: bool
+         :return: The details of the run.
+         :rtype: pandas.DataFrame
+         """
+         ...
+
+     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         """Get the metrics of the run.
+
+         :param client_run: The run to get the metrics of.
+         :type client_run: BatchClientRun
+         :return: The metrics of the run.
+         :rtype: Mapping[str, Any]
+         """
+         ...
+
+     def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         """Get the summary of the run.
+
+         :param client_run: The run to get the summary of.
+         :type client_run: BatchClientRun
+         :return: The summary of the run.
+         :rtype: Mapping[str, Any]
+         """
+         ...
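
Worth noting: these are structural typing.Protocol classes, so RunSubmitterClient above satisfies BatchClient without inheriting from it. Only HasAsyncCallable is marked @runtime_checkable, which is what allows the isinstance() check inside RunSubmitterClient.run(). A tiny sketch (the class name is invented):

    from azure.ai.evaluation._evaluate._batch_run.batch_clients import HasAsyncCallable

    class FakeFlow:
        def _to_async(self):
            async def _inner(*args, **kwargs):
                return None
            return _inner

    # isinstance() matches on the presence of a _to_async method, not on inheritance.
    assert isinstance(FakeFlow(), HasAsyncCallable)
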
azure/ai/evaluation/_evaluate/_eval_run.py
@@ -404,7 +404,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
              LOGGER.warning("The run results file was not found, skipping artifacts upload.")
              return
          # First we will list the files and the appropriate remote paths for them.
-         root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
+         root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_id)
          remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
          local_paths = []
          # Go over the artifact folder and upload all artifacts.
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -147,14 +147,22 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
      """
      handled_metrics = [
          EvaluationMetrics.PROTECTED_MATERIAL,
+         EvaluationMetrics.FICTIONAL_CHARACTERS,
+         EvaluationMetrics.ARTWORK,
+         EvaluationMetrics.LOGOS_AND_BRANDS,
          _InternalEvaluationMetrics.ECI,
          EvaluationMetrics.XPIA,
+         EvaluationMetrics.CODE_VULNERABILITY,
+         EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
      ]
      label_cols = []
+     details_cols = []
      for col in df.columns:
          metric_name = col.split(".")[1]
          if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
              label_cols.append(col)
+         if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+             details_cols = col

      label_df = df[label_cols]
      defect_rates = {}
@@ -166,8 +174,30 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
          except EvaluationException:  # only exception that can be cause is all NaN values
              msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
              LOGGER.warning(msg)
+
+     if details_cols:
+         details_df = df[details_cols]
+         detail_defect_rates = {}
+
+         for key, value in details_df.items():
+             _process_rows(value, detail_defect_rates)
+
+         for key, value in detail_defect_rates.items():
+             col_with_boolean_values = pd.to_numeric(value, errors="coerce")
+             try:
+                 defect_rates[f"{details_cols}.{key}_defect_rate"] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+             except EvaluationException:  # only exception that can be cause is all NaN values
+                 msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
+                 LOGGER.warning(msg)
+
      return label_cols, defect_rates

+ def _process_rows(row, detail_defect_rates):
+     for key, value in row.items():
+         if key not in detail_defect_rates:
+             detail_defect_rates[key] = []
+         detail_defect_rates[key].append(value)
+     return detail_defect_rates


  def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
      """Aggregate metrics from the evaluation results.
@@ -483,8 +513,10 @@ def _apply_target_to_data(
      run_summary = batch_client.get_run_summary(run)

      if run_summary["completed_lines"] == 0:
-         msg = (f"Evaluation target failed to produce any results."
-                f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+         msg = (
+             f"Evaluation target failed to produce any results."
+             f" Please check the logs at {run_summary['log_path']} for more details about cause of failure."
+         )
          raise EvaluationException(
              message=msg,
              target=ErrorTarget.EVALUATE,
@@ -525,7 +557,7 @@ def _process_column_mappings(

      processed_config: Dict[str, Dict[str, str]] = {}

-     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
+     expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")

      if column_mapping:
          for evaluator, mapping_config in column_mapping.items():

@@ -534,7 +566,7 @@ def _process_column_mappings(

              for map_to_key, map_value in mapping_config.items():
                  # Check if there's any unexpected reference other than ${target.} or ${data.}
-                 if unexpected_references.search(map_value):
+                 if not expected_references.search(map_value):
                      msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                      raise EvaluationException(
                          message=msg,
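
A quick illustration of the tightened validation (sample strings invented). The old pattern only rejected values containing an explicit non-target/data reference; the new anchored pattern additionally rejects values that are not exactly one ${target.x} or ${data.x} reference (and, since the character class omits digits, names containing digits):

    import re

    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")

    for value in ["${data.query}", "${target.response}", "${foo.bar}", "prefix ${data.query}"]:
        print(f"{value!r}: {'ok' if expected_references.search(value) else 'rejected'}")
    # '${data.query}': ok
    # '${target.response}': ok
    # '${foo.bar}': rejected
    # 'prefix ${data.query}': rejected
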
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -8,6 +8,7 @@ from typing_extensions import overload, override
  from azure.ai.evaluation._common.utils import nltk_tokenize

  from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


  class BleuScoreEvaluator(EvaluatorBase):

@@ -22,6 +23,8 @@ class BleuScoreEvaluator(EvaluatorBase):
      indicator of quality.

      The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+     :param threshold: The threshold for the evaluation. Default is 0.5.
+     :type threshold: float

      .. admonition:: Example:


@@ -31,17 +34,27 @@ class BleuScoreEvaluator(EvaluatorBase):
              :language: python
              :dedent: 8
              :caption: Initialize and call an BleuScoreEvaluator.
+
+     .. admonition:: Example with Threshold:
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_bleu_score_evaluator]
+             :end-before: [END threshold_bleu_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call an BleuScoreEvaluator.
      """

      id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-     def __init__(self):
-         super().__init__()
+     def __init__(self, *, threshold=0.5):
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)

      @override
      async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
-         """Produce a glue score evaluation result.
+         """Produce a bleu score evaluation result.

          :param eval_input: The input to the evaluation function.
          :type eval_input: Dict


@@ -56,9 +69,16 @@ class BleuScoreEvaluator(EvaluatorBase):
          # NIST Smoothing
          smoothing_function = SmoothingFunction().method4
          score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+         binary_result = False
+         if self._higher_is_better:
+             binary_result = score >= self._threshold
+         else:
+             binary_result = score <= self._threshold

          return {
              "bleu_score": score,
+             "bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+             "bleu_threshold": self._threshold,
          }

      @overload  # type: ignore
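
A minimal sketch of the new thresholded output (the sentences are invented, and it assumes EVALUATION_PASS_FAIL_MAPPING maps the boolean result to a pass/fail string, as the _constants change in this release suggests):

    from azure.ai.evaluation import BleuScoreEvaluator

    bleu = BleuScoreEvaluator(threshold=0.3)
    result = bleu(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # 1.4.0 adds a pass/fail verdict and echoes the threshold alongside the raw score.
    print(result["bleu_score"], result["bleu_result"], result["bleu_threshold"])
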
azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py
@@ -0,0 +1,5 @@
+ from ._code_vulnerability import CodeVulnerabilityEvaluator
+
+ __all__ = [
+     "CodeVulnerabilityEvaluator",
+ ]