azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
- azure/ai/evaluation/__init__.py +10 -0
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +7 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +165 -34
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +79 -1
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +73 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
- azure/ai/evaluation/_evaluate/_utils.py +117 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +976 -546
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_converters/_models.py

```diff
@@ -3,10 +3,18 @@ import json
 
 from pydantic import BaseModel
 
-from azure.ai.projects.models import RunStepFunctionToolCall
-
 from typing import List, Optional, Union
 
+# Models moved in a later version of agents SDK, so try a few different locations
+try:
+    from azure.ai.projects.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+try:
+    from azure.ai.agents.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+
 # Message roles constants.
 _SYSTEM = "system"
 _USER = "user"
```
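The new import block follows the standard fallback-import idiom: try the old module path, then the new one, and only fail when the symbol is actually needed. A minimal, self-contained sketch of the same idiom (the trailing `None` check is an illustrative addition, not part of the package):

```python
# Sketch of the fallback-import idiom: the second try rebinds the name if the
# newer SDK is installed; if neither import works, the name stays None.
RunStepFunctionToolCall = None
try:
    from azure.ai.projects.models import RunStepFunctionToolCall  # older location
except ImportError:
    pass
try:
    from azure.ai.agents.models import RunStepFunctionToolCall  # newer location
except ImportError:
    pass

if RunStepFunctionToolCall is None:
    # Neither agents SDK flavor is installed; code that needs the type must handle this.
    print("RunStepFunctionToolCall is unavailable")
```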
```diff
@@ -21,6 +29,57 @@ _FUNCTION = "function"
 # This is returned by AI services in the API to filter against tool invocations.
 _TOOL_CALLS = "tool_calls"
 
+# Constants to only be used internally in this file for the built-in tools.
+_CODE_INTERPRETER = "code_interpreter"
+_BING_GROUNDING = "bing_grounding"
+_FILE_SEARCH = "file_search"
+_AZURE_AI_SEARCH = "azure_ai_search"
+_FABRIC_DATAAGENT = "fabric_dataagent"
+
+# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
+# for evaluation purposes.
+_BUILT_IN_DESCRIPTIONS = {
+    _CODE_INTERPRETER: "Use code interpreter to read and interpret information from datasets, "
+    + "generate code, and create graphs and charts using your data. Supports "
+    + "up to 20 files.",
+    _BING_GROUNDING: "Enhance model output with web data.",
+    _FILE_SEARCH: "Search for data across uploaded files.",
+    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
+}
+
+# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
+_BUILT_IN_PARAMS = {
+    _CODE_INTERPRETER: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Generated code to be executed."}},
+    },
+    _BING_GROUNDING: {
+        "type": "object",
+        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
+    },
+    _FILE_SEARCH: {
+        "type": "object",
+        "properties": {
+            "ranking_options": {
+                "type": "object",
+                "properties": {
+                    "ranker": {"type": "string", "description": "Ranking algorithm to use."},
+                    "score_threshold": {"type": "number", "description": "Threshold for search results."},
+                },
+                "description": "Ranking options for search results.",
+            }
+        },
+    },
+    _AZURE_AI_SEARCH: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+    _FABRIC_DATAAGENT: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+}
 
 class Message(BaseModel):
     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
```
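Because the service does not expose definitions for built-in tools, the `_BUILT_IN_DESCRIPTIONS` and `_BUILT_IN_PARAMS` tables above give the converter something to describe for evaluation. A hedged sketch of how the two tables could be joined into a single tool-definition dict; the `builtin_tool_definition` helper is illustrative and not part of the package:

```python
# Illustrative only: join the built-in description and parameter tables into a
# ToolDefinition-shaped dict keyed by the built-in tool's type string.
_BUILT_IN_DESCRIPTIONS = {"bing_grounding": "Enhance model output with web data."}
_BUILT_IN_PARAMS = {
    "bing_grounding": {
        "type": "object",
        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
    },
}

def builtin_tool_definition(tool_type: str) -> dict:
    """Hypothetical helper: build a minimal definition for a hidden built-in tool."""
    return {
        "name": tool_type,
        "type": tool_type,
        "description": _BUILT_IN_DESCRIPTIONS.get(tool_type, ""),
        "parameters": _BUILT_IN_PARAMS.get(tool_type, {}),
    }

print(builtin_tool_definition("bing_grounding")["description"])
```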
```diff
@@ -98,6 +157,8 @@ class ToolDefinition(BaseModel):
 
     :param name: The name of the tool.
     :type name: str
+    :param type: The type of the tool.
+    :type type: str
     :param description: A description of the tool.
     :type description: str
     :param parameters: The parameters required by the tool.
```
```diff
@@ -105,6 +166,7 @@ class ToolDefinition(BaseModel):
     """
 
     name: str
+    type: str
     description: Optional[str] = None
     parameters: dict
 
```
```diff
@@ -191,6 +253,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
         arguments = {
             "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
         }
+    elif tool_call.details["type"] == "azure_ai_search":
+        arguments = {"input": tool_call.details["azure_ai_search"]["input"]}
+    elif tool_call.details["type"] == "fabric_dataagent":
+        arguments = {"input": tool_call.details["fabric_dataagent"]["input"]}
     else:
         # unsupported tool type, skip
         return messages
```
```diff
@@ -211,17 +277,17 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
     if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function
+        output = safe_loads(tool_call.details.function["output"])
     else:
         try:
             # Some built-ins may have output, others may not
             # Try to retrieve it, but if we don't find anything, skip adding the message
             # Just manually converting to dicts for easy serialization for now rather than custom serializers
-            if tool_call.details.type ==
+            if tool_call.details.type == _CODE_INTERPRETER:
                 output = tool_call.details.code_interpreter.outputs
-            elif tool_call.details.type ==
+            elif tool_call.details.type == _BING_GROUNDING:
                 return messages  # not supported yet from bing grounding tool
-            elif tool_call.details.type ==
+            elif tool_call.details.type == _FILE_SEARCH:
                 output = [
                     {
                         "file_id": result.file_id,
```
```diff
@@ -231,6 +297,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
                     }
                     for result in tool_call.details.file_search.results
                 ]
+            elif tool_call.details.type == _AZURE_AI_SEARCH:
+                output = tool_call.details.azure_ai_search["output"]
+            elif tool_call.details.type == _FABRIC_DATAAGENT:
+                output = tool_call.details.fabric_dataagent["output"]
         except:
             return messages
 
```
azure/ai/evaluation/_eval_mapping.py (new file)

```diff
@@ -0,0 +1,73 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+# Note: This was removed from the normal constants file due to circular import issues.
+
+# In the future, it would be nice to instead rely on the id value
+# of each eval class, but I wouldn't like to rely on those before
+# we simplify them into version-less, static values, instead of the
+# problematic registry references they currently are.
+
+# Import all evals
+from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+from azure.ai.evaluation import (
+    BleuScoreEvaluator,
+    CodeVulnerabilityEvaluator,
+    CoherenceEvaluator,
+    ContentSafetyEvaluator,
+    DocumentRetrievalEvaluator,
+    F1ScoreEvaluator,
+    FluencyEvaluator,
+    GleuScoreEvaluator,
+    GroundednessEvaluator,
+    GroundednessProEvaluator,
+    HateUnfairnessEvaluator,
+    IndirectAttackEvaluator,
+    IntentResolutionEvaluator,
+    MeteorScoreEvaluator,
+    ProtectedMaterialEvaluator,
+    QAEvaluator,
+    RelevanceEvaluator,
+    ResponseCompletenessEvaluator,
+    RetrievalEvaluator,
+    RougeScoreEvaluator,
+    SelfHarmEvaluator,
+    SexualEvaluator,
+    SimilarityEvaluator,
+    TaskAdherenceEvaluator,
+    ToolCallAccuracyEvaluator,
+    UngroundedAttributesEvaluator,
+    ViolenceEvaluator
+)
+
+EVAL_CLASS_MAP = {
+    BleuScoreEvaluator: "bleu_score",
+    CodeVulnerabilityEvaluator: "code_vulnerability",
+    CoherenceEvaluator: "coherence",
+    ContentSafetyEvaluator: "content_safety",
+    DocumentRetrievalEvaluator: "document_retrieval",
+    ECIEvaluator: "eci",
+    F1ScoreEvaluator: "f1_score",
+    FluencyEvaluator: "fluency",
+    GleuScoreEvaluator: "gleu_score",
+    GroundednessEvaluator: "groundedness",
+    GroundednessProEvaluator: "groundedness_pro",
+    HateUnfairnessEvaluator: "hate_unfairness",
+    IndirectAttackEvaluator: "indirect_attack",
+    IntentResolutionEvaluator: "intent_resolution",
+    MeteorScoreEvaluator: "meteor_score",
+    ProtectedMaterialEvaluator: "protected_material",
+    QAEvaluator: "qa",
+    RelevanceEvaluator: "relevance",
+    ResponseCompletenessEvaluator: "response_completeness",
+    RetrievalEvaluator: "retrieval",
+    RougeScoreEvaluator: "rouge_score",
+    SelfHarmEvaluator: "self_harm",
+    SexualEvaluator: "sexual",
+    SimilarityEvaluator: "similarity",
+    TaskAdherenceEvaluator: "task_adherence",
+    ToolCallAccuracyEvaluator: "tool_call_accuracy",
+    UngroundedAttributesEvaluator: "ungrounded_attributes",
+    ViolenceEvaluator: "violence",
+}
```
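The new `_eval_mapping.py` module assigns each evaluator class a stable, version-independent string identifier. A minimal sketch of how such a map can be queried; the `eval_name_for` helper is illustrative, and how the package itself consumes `EVAL_CLASS_MAP` is not shown in this diff:

```python
# Illustrative lookup against the new mapping (requires azure-ai-evaluation >= 1.7.0).
from azure.ai.evaluation import F1ScoreEvaluator
from azure.ai.evaluation._eval_mapping import EVAL_CLASS_MAP

def eval_name_for(evaluator) -> str:
    """Hypothetical helper: resolve the short metric name for an evaluator instance."""
    return EVAL_CLASS_MAP.get(type(evaluator), type(evaluator).__name__)

print(eval_name_for(F1ScoreEvaluator()))  # -> "f1_score"
```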
azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py

```diff
@@ -2,11 +2,12 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import asyncio
 import logging
 import pandas as pd
 import sys
 from collections import defaultdict
-from concurrent.futures import Future
+from concurrent.futures import Future
 from os import PathLike
 from typing import Any, Callable, Dict, Final, List, Mapping, Optional, Sequence, Union, cast
 
```
```diff
@@ -14,6 +15,8 @@ from .batch_clients import BatchClientRun, HasAsyncCallable
 from ..._legacy._batch_engine._run_submitter import RunSubmitter
 from ..._legacy._batch_engine._config import BatchEngineConfig
 from ..._legacy._batch_engine._run import Run
+from ..._legacy._adapters._constants import LINE_NUMBER
+from ..._legacy._common._thread_pool_executor_with_context import ThreadPoolExecutorWithContext
 
 
 LOGGER = logging.getLogger(__name__)
```
```diff
@@ -22,7 +25,9 @@ LOGGER = logging.getLogger(__name__)
 class RunSubmitterClient:
     def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
         self._config = config or BatchEngineConfig(LOGGER, use_async=True)
-        self._thread_pool =
+        self._thread_pool = ThreadPoolExecutorWithContext(
+            thread_name_prefix="evaluators_thread",
+            max_workers=self._config.max_concurrency)
 
     def run(
         self,
```
```diff
@@ -33,30 +38,36 @@
         **kwargs: Any,
     ) -> BatchClientRun:
         if not isinstance(data, pd.DataFrame):
-            # Should never get here
             raise ValueError("Data must be a pandas DataFrame")
-        if not column_mapping:
-            raise ValueError("Column mapping must be provided")
 
-        # The column mappings are
+        # The column mappings are indexed by data to indicate they come from the data
         # input. Update the inputs so that each entry is a dictionary with a data key
         # that contains the original input data.
         inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]
 
-        #
+        # Pass the correct previous run to the evaluator
+        run: Optional[BatchClientRun] = kwargs.pop("run", None)
+        if run:
+            kwargs["run"] = self._get_run(run)
+
+        # Try to get async function to use
         if isinstance(flow, HasAsyncCallable):
             flow = flow._to_async()  # pylint: disable=protected-access
 
-
+        # Start an event loop for async execution on a thread pool thread to separate it
+        # from the caller's thread.
+        run_submitter = RunSubmitter(self._config, self._thread_pool)
         run_future = self._thread_pool.submit(
-
-
-
-
-
-
-
-
+            asyncio.run,
+            run_submitter.submit(
+                dynamic_callable=flow,
+                inputs=inputs,
+                column_mapping=column_mapping,
+                name_prefix=evaluator_name,
+                created_on=kwargs.pop("created_on", None),
+                storage_creator=kwargs.pop("storage_creator", None),
+                **kwargs,
+            )
         )
 
         return run_future
```
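The rewritten `run()` hands `asyncio.run` and the `RunSubmitter.submit(...)` coroutine to the thread pool, so the event loop lives entirely on a worker thread and the caller only holds a `Future`. A stripped-down sketch of that pattern with placeholder names:

```python
# Minimal sketch: execute a coroutine to completion on a worker thread via asyncio.run,
# keeping the submitting thread (and any event loop it may own) uninvolved.
import asyncio
from concurrent.futures import ThreadPoolExecutor

async def submit(inputs):
    # Placeholder for RunSubmitter.submit(...): pretend to process each row.
    await asyncio.sleep(0.01)
    return [{"output": value} for value in inputs]

pool = ThreadPoolExecutor(max_workers=2, thread_name_prefix="evaluators_thread")
future = pool.submit(asyncio.run, submit([1, 2, 3]))
print(future.result())  # blocks only when the caller asks for the result
pool.shutdown()
```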
```diff
@@ -75,7 +86,10 @@
                 key = f"{prefix}.{k}"
                 data[key].append(value)
 
+        # Go from a list of dictionaries (i.e. a row view of the data) to a dictionary of lists
+        # (i.e. a column view of the data)
         _update("inputs", run.inputs)
+        _update("inputs", [{ LINE_NUMBER: i } for i in range(len(run.inputs)) ])
         _update("outputs", run.outputs)
 
         df = pd.DataFrame(data).reindex(columns=[k for k in data.keys()])
```
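The comment added above describes the reshaping that `_update` performs: the run results arrive as a row view (a list of dicts) and are flattened into a column view (a dict of lists) before the DataFrame is built. A small standalone sketch of that transformation with illustrative names:

```python
# Sketch: turn a row view (list of dicts) into a column view (dict of lists),
# prefixing keys the way the resulting DataFrame columns are prefixed.
from collections import defaultdict
import pandas as pd

rows = [{"query": "q1", "score": 4}, {"query": "q2", "score": 5}]
columns = defaultdict(list)
for row in rows:
    for key, value in row.items():
        columns[f"inputs.{key}"].append(value)

df = pd.DataFrame(columns)
print(df.columns.tolist())  # ['inputs.query', 'inputs.score']
```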
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py

```diff
@@ -8,6 +8,10 @@ from typing import Optional, Type, Union
 from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from azure.ai.evaluation._legacy._adapters.utils import ClientUserAgentUtil
 from azure.ai.evaluation._legacy._adapters.tracing import inject_openai_api, recover_openai_api
+from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
+    inject_openai_api as ported_inject_openai_api,
+    recover_openai_api as ported_recover_openai_api,
+)
 
 from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
```
```diff
@@ -68,6 +72,7 @@ class EvalRunContext:
 
         if isinstance(self.client, RunSubmitterClient):
             set_event_loop_policy()
+            ported_inject_openai_api()
 
     def __exit__(
         self,
```
```diff
@@ -92,3 +97,6 @@
         if self._is_otel_timeout_set_by_system:
             os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
             self._is_otel_timeout_set_by_system = False
+
+        if isinstance(self.client, RunSubmitterClient):
+            ported_recover_openai_api()
```
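Taken together, the two hunks above pair the ported OpenAI injection on `__enter__` with a matching recovery on `__exit__` whenever the client is a `RunSubmitterClient`. The general shape of that enter/exit pairing, sketched with stand-in functions rather than the package's own injector:

```python
# Sketch of the pairing: patch on the way in, always undo on the way out,
# even if the body raises. The two functions stand in for the ported injector.
from contextlib import contextmanager

def inject_openai_api():   # stand-in for ported_inject_openai_api
    print("OpenAI client patched")

def recover_openai_api():  # stand-in for ported_recover_openai_api
    print("OpenAI client restored")

@contextmanager
def eval_run_context():
    inject_openai_api()
    try:
        yield
    finally:
        recover_openai_api()

with eval_run_context():
    print("running evaluation batch")
```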
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py

```diff
@@ -58,6 +58,11 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         if not name:
             name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
 
+        # Pass the correct previous run to the evaluator
+        run: Optional[BatchClientRun] = kwargs.pop("run", None)
+        if run:
+            kwargs["run"] = self.get_result(run)
+
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
             self._pf_client.run,
```
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py

```diff
@@ -5,8 +5,15 @@ import os
 import types
 from typing import Optional, Type
 
+from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClient
+from azure.ai.evaluation._evaluate._batch_run import RunSubmitterClient
 from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
+    inject_openai_api as ported_inject_openai_api,
+    recover_openai_api as ported_recover_openai_api,
+)
 from azure.ai.evaluation._constants import PF_DISABLE_TRACING
+from azure.ai.evaluation._evaluate._utils import set_event_loop_policy
 
 
 class TargetRunContext:
```
```diff
@@ -16,7 +23,8 @@ class TargetRunContext:
     :type upload_snapshot: bool
     """
 
-    def __init__(self, upload_snapshot: bool = False) -> None:
+    def __init__(self, client: BatchClient, upload_snapshot: bool = False) -> None:
+        self._client = client
         self._upload_snapshot = upload_snapshot
         self._original_cwd = os.getcwd()
 
```
```diff
@@ -32,6 +40,11 @@
 
         os.environ[PF_DISABLE_TRACING] = "true"
 
+        if isinstance(self._client, RunSubmitterClient):
+            ported_inject_openai_api()
+            # For addressing the issue of asyncio event loop closed on Windows
+            set_event_loop_policy()
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
```
```diff
@@ -44,3 +57,6 @@
         os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
 
         os.environ.pop(PF_DISABLE_TRACING, None)
+
+        if isinstance(self._client, RunSubmitterClient):
+            ported_recover_openai_api()
```
azure/ai/evaluation/_evaluate/_eval_run.py

```diff
@@ -295,7 +295,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"
 
     def _get_token(self) -> str:
-        return self._management_client.get_token()
+        return self._management_client.get_token().token
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
```
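The single-line fix above matters because, assuming the management client surfaces an azure-core credential, `get_token()` returns an `AccessToken` named tuple rather than a string; the bearer value is its `.token` field. For example, with `azure-identity`:

```python
# Illustration with azure-identity: get_token returns AccessToken(token, expires_on),
# so the string for an Authorization header is access.token, not the tuple itself.
from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
access = credential.get_token("https://management.azure.com/.default")
headers = {"Authorization": f"Bearer {access.token}"}
print(type(access).__name__, access.expires_on > 0)
```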