azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
- azure/ai/evaluation/_evaluate/_utils.py +5 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +4 -3
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +655 -2665
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED

@@ -46,6 +46,7 @@ from ._aoai.label_grader import AzureOpenAILabelGrader
 from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
 from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
 from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader
+from ._aoai.python_grader import AzureOpenAIPythonGrader


 _patch_all = []
@@ -53,21 +54,46 @@ _patch_all = []
 # The converter from the AI service to the evaluator schema requires a dependency on
 # ai.projects, but we also don't want to force users installing ai.evaluations to pull
 # in ai.projects. So we only import it if it's available and the user has ai.projects.
-try:
-    from ._converters._ai_services import AIAgentConverter
+# We use lazy loading to avoid printing messages during import unless the classes are actually used.
+_lazy_imports = {}

-    _patch_all.append("AIAgentConverter")
-except ImportError:
-    print(
-        "[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`."
-    )

-
-
+def _create_lazy_import(class_name, module_path, dependency_name):
+    """Create a lazy import function for optional dependencies.

-
-
-
+    Args:
+        class_name: Name of the class to import
+        module_path: Module path to import from
+        dependency_name: Name of the dependency package for error message
+
+    Returns:
+        A function that performs the lazy import when called
+    """
+
+    def lazy_import():
+        try:
+            module = __import__(module_path, fromlist=[class_name])
+            cls = getattr(module, class_name)
+            _patch_all.append(class_name)
+            return cls
+        except ImportError:
+            raise ImportError(
+                f"Could not import {class_name}. Please install the dependency with `pip install {dependency_name}`."
+            )
+
+    return lazy_import
+
+
+_lazy_imports["AIAgentConverter"] = _create_lazy_import(
+    "AIAgentConverter",
+    "azure.ai.evaluation._converters._ai_services",
+    "azure-ai-projects",
+)
+_lazy_imports["SKAgentConverter"] = _create_lazy_import(
+    "SKAgentConverter",
+    "azure.ai.evaluation._converters._sk_services",
+    "semantic-kernel",
+)

 __all__ = [
     "evaluate",
@@ -110,6 +136,14 @@ __all__ = [
     "AzureOpenAIStringCheckGrader",
     "AzureOpenAITextSimilarityGrader",
     "AzureOpenAIScoreModelGrader",
+    "AzureOpenAIPythonGrader",
 ]

 __all__.extend([p for p in _patch_all if p not in __all__])
+
+
+def __getattr__(name):
+    """Handle lazy imports for optional dependencies."""
+    if name in _lazy_imports:
+        return _lazy_imports[name]()
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
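
The practical effect of the lazy-import rework above is that importing the package no longer prints an informational message when azure-ai-projects is missing; the ImportError now surfaces only when the converter is first accessed, via the module-level __getattr__ hook. A minimal sketch of the consumer-visible behavior (illustrative, not from the package's samples):

    import azure.ai.evaluation as evals  # no "[INFO] Could not import..." output anymore

    try:
        # Attribute access triggers the module-level __getattr__ hook, which
        # performs the deferred import of azure.ai.evaluation._converters._ai_services.
        converter_cls = evals.AIAgentConverter
    except ImportError as err:
        # Raised here, at first use, if azure-ai-projects is not installed.
        print(err)
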
azure/ai/evaluation/_aoai/python_grader.py
ADDED

@@ -0,0 +1,84 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Any, Dict, Union, Optional
+
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from openai.types.graders import PythonGrader
+from azure.ai.evaluation._common._experimental import experimental
+
+from .aoai_grader import AzureOpenAIGrader
+
+
+@experimental
+class AzureOpenAIPythonGrader(AzureOpenAIGrader):
+    """
+    Wrapper class for OpenAI's Python code graders.
+
+    Enables custom Python-based evaluation logic with flexible scoring and
+    pass/fail thresholds. The grader executes user-provided Python code
+    to evaluate outputs against custom criteria.
+
+    Supplying a PythonGrader to the `evaluate` method will cause an
+    asynchronous request to evaluate the grader via the OpenAI API. The
+    results of the evaluation will then be merged into the standard
+    evaluation results.
+
+    :param model_config: The model configuration to use for the grader.
+    :type model_config: Union[
+        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration
+    ]
+    :param name: The name of the grader.
+    :type name: str
+    :param image_tag: The image tag for the Python execution environment.
+    :type image_tag: str
+    :param pass_threshold: Score threshold for pass/fail classification.
+        Scores >= threshold are considered passing.
+    :type pass_threshold: float
+    :param source: Python source code containing the grade function.
+        Must define: def grade(sample: dict, item: dict) -> float
+    :type source: str
+    :param kwargs: Additional keyword arguments to pass to the grader.
+    :type kwargs: Any
+
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_common.py
+            :start-after: [START python_grader_example]
+            :end-before: [END python_grader_example]
+            :language: python
+            :dedent: 8
+            :caption: Using AzureOpenAIPythonGrader for custom evaluation logic.
+    """
+
+    id = "azureai://built-in/evaluators/azure-openai/python_grader"
+
+    def __init__(
+        self,
+        *,
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        name: str,
+        image_tag: str,
+        pass_threshold: float,
+        source: str,
+        **kwargs: Any,
+    ):
+        # Validate pass_threshold
+        if not 0.0 <= pass_threshold <= 1.0:
+            raise ValueError("pass_threshold must be between 0.0 and 1.0")
+
+        # Store pass_threshold as instance attribute for potential future use
+        self.pass_threshold = pass_threshold
+
+        # Create OpenAI PythonGrader instance
+        grader = PythonGrader(
+            name=name,
+            image_tag=image_tag,
+            pass_threshold=pass_threshold,
+            source=source,
+            type="python",
+        )
+
+        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
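
A hedged usage sketch for the new grader, based only on the constructor and docstring above; the endpoint, deployment, image tag, and data-column names are placeholders rather than values mandated by the SDK:

    from azure.ai.evaluation import AzureOpenAIPythonGrader, evaluate

    python_grader = AzureOpenAIPythonGrader(
        model_config={
            "azure_endpoint": "https://<your-resource>.openai.azure.com",
            "api_key": "<api-key>",
            "azure_deployment": "<deployment>",
        },
        name="exact_match",
        image_tag="2025-05-08",  # placeholder Python execution-environment tag
        pass_threshold=0.5,      # scores >= 0.5 are classified as passing
        source='''
    def grade(sample: dict, item: dict) -> float:
        # Score 1.0 when the response matches the reference exactly, else 0.0.
        return 1.0 if item.get("response") == item.get("ground_truth") else 0.0
    ''',
    )

    results = evaluate(data="data.jsonl", evaluators={"exact_match": python_grader})
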
azure/ai/evaluation/_aoai/score_model_grader.py
CHANGED

@@ -84,6 +84,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
             grader_kwargs["range"] = range
         if sampling_params is not None:
             grader_kwargs["sampling_params"] = sampling_params
+        grader_kwargs["pass_threshold"] = self.pass_threshold

         grader = ScoreModelGrader(**grader_kwargs)

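With the one-line change above, the pass_threshold supplied to AzureOpenAIScoreModelGrader is now forwarded into the ScoreModelGrader definition sent to the service instead of being kept client-side only. A hedged sketch, assuming the grader's documented keyword arguments (all values are illustrative):

    from azure.ai.evaluation import AzureOpenAIScoreModelGrader

    helpfulness = AzureOpenAIScoreModelGrader(
        model_config={
            "azure_endpoint": "https://<your-resource>.openai.azure.com",
            "api_key": "<api-key>",
            "azure_deployment": "<deployment>",
        },
        name="helpfulness",
        model="gpt-4o",
        input=[{"role": "user", "content": "Rate the helpfulness of: {{ item.response }}"}],
        range=[0.0, 1.0],
        pass_threshold=0.7,  # now included in the grader payload, not just stored locally
    )
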
azure/ai/evaluation/_common/onedp/models/_models.py
CHANGED

@@ -1961,12 +1961,16 @@ class Message(_Model):
     :vartype role: str
     :ivar content: The content.
     :vartype content: str
+    :ivar context: The context.
+    :vartype context: str
     """

     role: Optional[str] = rest_field(name="Role", visibility=["read", "create", "update", "delete", "query"])
     """The role."""
     content: Optional[str] = rest_field(name="Content", visibility=["read", "create", "update", "delete", "query"])
     """The content."""
+    context: Optional[str] = rest_field(name="Context", visibility=["read", "create", "update", "delete", "query"])
+    """The context."""

     @overload
     def __init__(
@@ -1974,6 +1978,7 @@ class Message(_Model):
         *,
         role: Optional[str] = None,
         content: Optional[str] = None,
+        context: Optional[str] = None,
     ) -> None: ...

     @overload
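
The generated Message model now round-trips an optional Context field alongside Role and Content. An illustrative construction only; the module path is internal and may change between releases:

    from azure.ai.evaluation._common.onedp.models import Message  # internal path

    msg = Message(
        role="user",
        content="What is the capital of France?",
        context="Retrieved passage: Paris is the capital of France.",
    )
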
azure/ai/evaluation/_common/rai_service.py
CHANGED

@@ -290,7 +290,7 @@ async def submit_request_onedp(
     payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
     headers = get_common_headers(token, evaluator_name)
     if scan_session_id:
-        headers["
+        headers["x-ms-client-request-id"] = scan_session_id
     response = client.evaluations.submit_annotation(payload, headers=headers)
     result = json.loads(response)
     operation_id = result["location"].split("/")[-1]
@@ -319,8 +319,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
     token = await fetch_or_reuse_token(credential, token)
     headers = get_common_headers(token)

-    async with
-        response = await client.get(url, headers=headers)
+    async with get_async_http_client() as client:
+        response = await client.get(url, headers=headers, timeout=RAIService.TIMEOUT)

     if response.status_code == 200:
         return response.json()
azure/ai/evaluation/_common/utils.py
CHANGED

@@ -6,11 +6,11 @@ import posixpath
 import re
 import math
 import threading
-from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
+from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin

 import nltk
 from azure.storage.blob import ContainerClient
-from typing_extensions import NotRequired, Required, TypeGuard
+from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
 from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -127,17 +127,15 @@ def construct_prompty_model_config(
     return prompty_model_config


-def is_onedp_project(azure_ai_project: AzureAIProject) ->
+def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
     """Check if the Azure AI project is an OneDP project.

     :param azure_ai_project: The scope of the Azure AI project.
-    :type azure_ai_project:
+    :type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
     :return: True if the Azure AI project is an OneDP project, False otherwise.
     :rtype: bool
     """
-    if isinstance(azure_ai_project, str):
-        return True
-    return False
+    return isinstance(azure_ai_project, str)
@@ -494,14 +492,17 @@ def _extract_text_from_content(content):
     return text


-def _get_conversation_history(query):
+def _get_conversation_history(query, include_system_messages=False):
     all_user_queries = []
     cur_user_query = []
     all_agent_responses = []
     cur_agent_response = []
+    system_message = None
     for msg in query:
         if not "role" in msg:
             continue
+        if include_system_messages and msg["role"] == "system" and "content" in msg:
+            system_message = msg.get("content", "")
         if msg["role"] == "user" and "content" in msg:
             if cur_agent_response != []:
                 all_agent_responses.append(cur_agent_response)
@@ -530,13 +531,18 @@ def _get_conversation_history(query):
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
-    return {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
-
+    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    if include_system_messages:
+        result["system_message"] = system_message
+    return result


 def _pretty_format_conversation_history(conversation_history):
     """Formats the conversation history for better readability."""
     formatted_history = ""
+    if "system_message" in conversation_history and conversation_history["system_message"] is not None:
+        formatted_history += "SYSTEM_PROMPT:\n"
+        formatted_history += " " + conversation_history["system_message"] + "\n\n"
     for i, (user_query, agent_response) in enumerate(
         zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
     ):
@@ -552,10 +558,10 @@ def _pretty_format_conversation_history(conversation_history):
     return formatted_history


-def reformat_conversation_history(query, logger=None):
+def reformat_conversation_history(query, logger=None, include_system_messages=False):
     """Reformats the conversation history to a more compact representation."""
     try:
-        conversation_history = _get_conversation_history(query)
+        conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
         return _pretty_format_conversation_history(conversation_history)
     except:
         # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
@@ -570,22 +576,53 @@ def reformat_conversation_history(query, logger=None):
     return query


-def _get_agent_response(agent_response_msgs):
-    """Extracts
+def _get_agent_response(agent_response_msgs, include_tool_messages=False):
+    """Extracts formatted agent response including text, and optionally tool calls/results."""
     agent_response_text = []
+    tool_results = {}
+
+    # First pass: collect tool results
+    if include_tool_messages:
+        for msg in agent_response_msgs:
+            if msg.get("role") == "tool" and "tool_call_id" in msg:
+                for content in msg.get("content", []):
+                    if content.get("type") == "tool_result":
+                        result = content.get("tool_result")
+                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+
+    # Second pass: parse assistant messages and tool calls
     for msg in agent_response_msgs:
-        if "role" in msg and msg
+        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
             text = _extract_text_from_content(msg["content"])
             if text:
                 agent_response_text.extend(text)
+            if include_tool_messages:
+                for content in msg.get("content", []):
+                    # Todo: Verify if this is the correct way to handle tool calls
+                    if content.get("type") == "tool_call":
+                        if "tool_call" in content and "function" in content.get("tool_call", {}):
+                            tc = content.get("tool_call", {})
+                            func_name = tc.get("function", {}).get("name", "")
+                            args = tc.get("function", {}).get("arguments", {})
+                            tool_call_id = tc.get("id")
+                        else:
+                            tool_call_id = content.get("tool_call_id")
+                            func_name = content.get("name", "")
+                            args = content.get("arguments", {})
+                        args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                        call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                        agent_response_text.append(call_line)
+                        if tool_call_id in tool_results:
+                            agent_response_text.append(tool_results[tool_call_id])

     return agent_response_text


-def reformat_agent_response(response, logger=None):
+def reformat_agent_response(response, logger=None, include_tool_messages=False):
     try:
         if response is None or response == []:
             return ""
-        agent_response = _get_agent_response(response)
+        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
         if agent_response == []:
             # If no message could be extracted, likely the format changed, fallback to the original response in that case
             if logger:
@@ -602,6 +639,26 @@ def reformat_agent_response(response, logger=None):
         return response


+def reformat_tool_definitions(tool_definitions, logger=None):
+    try:
+        output_lines = ["TOOL_DEFINITIONS:"]
+        for tool in tool_definitions:
+            name = tool.get("name", "unnamed_tool")
+            desc = tool.get("description", "").strip()
+            params = tool.get("parameters", {}).get("properties", {})
+            param_names = ", ".join(params.keys()) if params else "no parameters"
+            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+        return "\n".join(output_lines)
+    except Exception as e:
+        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(
+                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
+            )
+        return tool_definitions
+
+
 def upload(path: str, container_client: ContainerClient, logger=None):
     """Upload files or directories to Azure Blob Storage using a container client.

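The new reformat_tool_definitions helper flattens tool schemas into the compact text block used in evaluator prompts. A sketch of its behavior, derived directly from the code above (the module path is internal, so treat the import as illustrative):

    from azure.ai.evaluation._common.utils import reformat_tool_definitions

    tools = [
        {
            "name": "fetch_weather",
            "description": "Look up the current weather for a city.",
            "parameters": {"properties": {"city": {"type": "string"}}},
        }
    ]

    print(reformat_tool_definitions(tools))
    # TOOL_DEFINITIONS:
    # - fetch_weather: Look up the current weather for a city. (inputs: city)
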
azure/ai/evaluation/_converters/_ai_services.py
CHANGED

@@ -11,7 +11,18 @@ from azure.ai.evaluation._common._experimental import experimental
 from packaging.version import Version

 # Constants.
-from ._models import
+from ._models import (
+    _USER,
+    _AGENT,
+    _TOOL,
+    _TOOL_CALL,
+    _TOOL_CALLS,
+    _FUNCTION,
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+    _OPENAPI,
+    OpenAPIToolDefinition,
+)

 # Message instances.
 from ._models import Message, SystemMessage, UserMessage, AssistantMessage, ToolCall
@@ -93,7 +104,7 @@ class AIAgentConverter:
         return tool_calls_chronological

     @staticmethod
-    def _extract_function_tool_definitions(thread_run: object) -> List[ToolDefinition]:
+    def _extract_function_tool_definitions(thread_run: object) -> List[Union[ToolDefinition, OpenAPIToolDefinition]]:
         """
         Extracts tool definitions from a thread run.

@@ -121,6 +132,26 @@ class AIAgentConverter:
                         parameters=parameters,
                     )
                 )
+            elif tool.type == _OPENAPI:
+                openapi_tool = tool.openapi
+                tool_definition = OpenAPIToolDefinition(
+                    name=openapi_tool.name,
+                    description=openapi_tool.description,
+                    type=_OPENAPI,
+                    spec=openapi_tool.spec,
+                    auth=openapi_tool.auth.as_dict(),
+                    default_params=openapi_tool.default_params.as_dict() if openapi_tool.default_params else None,
+                    functions=[
+                        ToolDefinition(
+                            name=func.get("name"),
+                            description=func.get("description"),
+                            parameters=func.get("parameters"),
+                            type="function",
+                        )
+                        for func in openapi_tool.get("functions")
+                    ],
+                )
+                final_tools.append(tool_definition)
             else:
                 # Add limited support for built-in tools. Descriptions and parameters
                 # are not published, but we'll include placeholders.
@@ -243,16 +274,30 @@ class AIAgentConverter:
                 if len(single_turn.content) < 1:
                     continue

-
-                content
-
-                    "text":
-
+                content_list = []
+                # If content is a list, process all content items.
+                for content_item in single_turn.content:
+                    if content_item.type == "text":
+                        content_list.append(
+                            {
+                                "type": "text",
+                                "text": content_item.text.value,
+                            }
+                        )
+                    elif content_item.type == "image":
+                        content_list.append(
+                            {
+                                "type": "image",
+                                "image": {
+                                    "file_id": content_item.image_file.file_id,
+                                },
+                            }
+                        )

                 # If we have a user message, then we save it as such and since it's a human message, there is no
                 # run_id associated with it.
                 if single_turn.role == _USER:
-                    final_messages.append(UserMessage(content=
+                    final_messages.append(UserMessage(content=content_list, createdAt=single_turn.created_at))
                     continue
@@ -261,7 +306,7 @@ class AIAgentConverter:
                 if single_turn.role == _AGENT:
                     # We are required to put the run_id in the assistant message.
                     final_messages.append(
-                        AssistantMessage(content=
+                        AssistantMessage(content=content_list, run_id=single_turn.run_id, createdAt=single_turn.created_at)
                     )
                     continue
@@ -791,6 +836,7 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
                 limit=self._AI_SERVICES_API_MAX_LIMIT,
                 order="asc",
                 after=after,
+                include=["step_details.tool_calls[*].file_search.results[*].content"],
             )
             has_more = run_steps.has_more
             after = run_steps.last_id
@@ -838,7 +884,11 @@ class FDPAgentDataRetriever(AIAgentDataRetriever):
     def _list_run_steps_chronological(self, thread_id: str, run_id: str):

         return self.project_client.agents.run_steps.list(
-            thread_id=thread_id,
+            thread_id=thread_id,
+            run_id=run_id,
+            limit=self._AI_SERVICES_API_MAX_LIMIT,
+            order="asc",
+            include=["step_details.tool_calls[*].file_search.results[*].content"],
         )

     def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
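
Both retrievers now pass the same include filter when listing run steps, so file-search tool calls come back with their result contents inlined. A hedged sketch of the equivalent direct call with the azure-ai-projects client (the call shape is taken from the diff above; client construction details are illustrative):

    from azure.identity import DefaultAzureCredential
    from azure.ai.projects import AIProjectClient

    project_client = AIProjectClient(
        endpoint="https://<your-project-endpoint>",
        credential=DefaultAzureCredential(),
    )

    # Mirrors the converter change: request file-search result contents with each step.
    run_steps = project_client.agents.run_steps.list(
        thread_id="<thread-id>",
        run_id="<run-id>",
        order="asc",
        include=["step_details.tool_calls[*].file_search.results[*].content"],
    )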