azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
|
@@ -6,14 +6,14 @@ import posixpath
|
|
|
6
6
|
import re
|
|
7
7
|
import math
|
|
8
8
|
import threading
|
|
9
|
-
from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
|
|
9
|
+
from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
|
|
10
10
|
|
|
11
11
|
import nltk
|
|
12
12
|
from azure.storage.blob import ContainerClient
|
|
13
|
-
from typing_extensions import NotRequired, Required, TypeGuard
|
|
13
|
+
from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
|
|
14
14
|
from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
|
|
15
15
|
from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
|
|
16
|
-
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
16
|
+
from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
17
17
|
from azure.ai.evaluation._model_configurations import (
|
|
18
18
|
AzureAIProject,
|
|
19
19
|
AzureOpenAIModelConfiguration,
|
|
@@ -126,17 +126,17 @@ def construct_prompty_model_config(
|
|
|
126
126
|
|
|
127
127
|
return prompty_model_config
|
|
128
128
|
|
|
129
|
-
|
|
129
|
+
|
|
130
|
+
def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
|
|
130
131
|
"""Check if the Azure AI project is an OneDP project.
|
|
131
132
|
|
|
132
133
|
:param azure_ai_project: The scope of the Azure AI project.
|
|
133
|
-
:type azure_ai_project:
|
|
134
|
+
:type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
|
|
134
135
|
:return: True if the Azure AI project is an OneDP project, False otherwise.
|
|
135
136
|
:rtype: bool
|
|
136
137
|
"""
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
return False
|
|
138
|
+
return isinstance(azure_ai_project, str)
|
|
139
|
+
|
|
140
140
|
|
|
141
141
|
def validate_azure_ai_project(o: object) -> AzureAIProject:
|
|
142
142
|
fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
|
|
@@ -291,7 +291,8 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
|
|
|
291
291
|
|
|
292
292
|
return cast(T_TypedDict, o)
|
|
293
293
|
|
|
294
|
-
|
|
294
|
+
|
|
295
|
+
def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool:
|
|
295
296
|
"""Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].
|
|
296
297
|
|
|
297
298
|
:param score: The score to check.
|
|
@@ -310,6 +311,7 @@ def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5)
|
|
|
310
311
|
|
|
311
312
|
return min_score <= numeric_score <= max_score
|
|
312
313
|
|
|
314
|
+
|
|
313
315
|
def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
|
|
314
316
|
"""Parse the output of prompt-based quality evaluators that return a score and reason.
|
|
315
317
|
|
|
@@ -481,6 +483,182 @@ def validate_conversation(conversation):
|
|
|
481
483
|
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
482
484
|
)
|
|
483
485
|
|
|
486
|
+
|
|
487
|
+
def _extract_text_from_content(content):
    """Collect the ``"text"`` values from a sequence of message content items.

    :param content: Iterable of content items. Items are expected to be mappings
        such as ``{"type": "text", "text": "..."}``.
    :return: The ``"text"`` value of every mapping item that has one, in input order.
    :rtype: list
    """
    # Only mapping items are inspected. For a plain-string item, ``"text" in item`` is a
    # substring test, and the subsequent ``item["text"]`` would raise TypeError.
    return [item["text"] for item in content if isinstance(item, Mapping) and "text" in item]
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def _get_conversation_history(query, include_system_messages=False):
    """Group a flat list of chat messages into user turns and agent turns.

    Consecutive ``user`` messages are collected into one user turn and consecutive
    ``assistant`` messages into one agent turn. Messages without a ``role`` or
    without ``content`` are ignored, as are messages with any other role.

    :param query: Iterable of message mappings with ``role`` and ``content`` keys.
    :param include_system_messages: When True, the content of the last ``system``
        message seen is returned under the ``"system_message"`` key.
    :type include_system_messages: bool
    :return: A dict with ``"user_queries"`` and ``"agent_responses"`` (lists of turns),
        plus ``"system_message"`` when ``include_system_messages`` is True.
    :rtype: dict
    :raises EvaluationException: If the number of user turns is not exactly one more
        than the number of agent turns.
    """
    all_user_queries = []
    cur_user_query = []
    all_agent_responses = []
    cur_agent_response = []
    system_message = None

    for msg in query:
        # Every branch below needs both keys, so skip early when either is missing.
        if "role" not in msg or "content" not in msg:
            continue
        role = msg["role"]
        content = msg["content"]

        if role == "system":
            if include_system_messages:
                system_message = content
        elif role == "user":
            # A user message closes the agent turn that was being collected.
            if cur_agent_response:
                all_agent_responses.append(cur_agent_response)
                cur_agent_response = []
            text_in_msg = _extract_text_from_content(content)
            if text_in_msg:
                cur_user_query.append(text_in_msg)
        elif role == "assistant":
            # An assistant message closes the user turn that was being collected.
            if cur_user_query:
                all_user_queries.append(cur_user_query)
                cur_user_query = []
            text_in_msg = _extract_text_from_content(content)
            if text_in_msg:
                cur_agent_response.append(text_in_msg)

    # Flush whichever turn was still open when the messages ran out.
    if cur_user_query:
        all_user_queries.append(cur_user_query)
    if cur_agent_response:
        all_agent_responses.append(cur_agent_response)

    # The history must end on a user turn: one more user turn than agent turns.
    if len(all_user_queries) != len(all_agent_responses) + 1:
        raise EvaluationException(
            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
            internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.USER_ERROR,
        )

    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
    if include_system_messages:
        result["system_message"] = system_message
    return result
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def _pretty_format_conversation_history(conversation_history):
    """Formats the conversation history for better readability.

    :param conversation_history: Mapping with ``"user_queries"`` and ``"agent_responses"``
        (lists of turns, each turn a list of messages, each message a list of text lines)
        and optionally ``"system_message"``.
    :return: The history rendered as text with ``User turn N`` / ``Agent turn N`` headers.
    :rtype: str
    """
    # Pieces are collected and joined once instead of growing a string with ``+=``.
    parts = []

    system_message = conversation_history.get("system_message")
    if system_message is not None:
        parts.append("SYSTEM_PROMPT:\n")
        parts.append(" " + system_message + "\n\n")

    # The trailing None pairs the final user turn with "no agent response yet".
    turns = zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
    for turn_number, (user_query, agent_response) in enumerate(turns, start=1):
        parts.append(f"User turn {turn_number}:\n")
        parts.extend(" " + "\n ".join(msg) for msg in user_query)
        parts.append("\n\n")
        if agent_response:
            parts.append(f"Agent turn {turn_number}:\n")
            parts.extend(" " + "\n ".join(msg) for msg in agent_response)
            parts.append("\n\n")

    return "".join(parts)
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def reformat_conversation_history(query, logger=None, include_system_messages=False):
    """Reformats the conversation history to a more compact representation.

    :param query: The conversation messages to reformat.
    :param logger: Optional logger; it receives a warning when the fallback is taken.
    :param include_system_messages: Whether the system message is included in the output.
    :type include_system_messages: bool
    :return: The formatted history, or ``query`` unchanged if it could not be parsed.
    """
    try:
        conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
        return _pretty_format_conversation_history(conversation_history)
    except Exception:
        # Deliberately broad best-effort fallback, but narrower than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
        # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
        # From our tests the negative impact on IntentResolution is:
        #   Higher intra model variance (0.142 vs 0.046)
        #   Higher inter model variance (0.345 vs 0.607)
        #   Lower percentage of mode in Likert scale (73.4% vs 75.4%)
        #   Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
        # NOTE(review): the inter model pair reads lower-vs-higher, unlike the other rows; confirm the figures' order.
        if logger:
            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
        return query
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
def _get_agent_response(agent_response_msgs, include_tool_messages=False):
    """Extracts formatted agent response including text, and optionally tool calls/results.

    :param agent_response_msgs: Iterable of message mappings produced by the agent run.
    :param include_tool_messages: When True, tool calls are rendered as ``[TOOL_CALL] ...``
        lines, each followed by its ``[TOOL_RESULT] ...`` line when a matching result exists.
    :type include_tool_messages: bool
    :return: The extracted lines in message order; empty if nothing could be extracted.
    :rtype: list
    """
    agent_response_text = []
    tool_results = {}

    # First pass: collect tool results, keyed by the id of the tool call they answer.
    if include_tool_messages:
        for msg in agent_response_msgs:
            if msg.get("role") == "tool" and "tool_call_id" in msg:
                for content in msg.get("content", []):
                    # Non-mapping items (e.g. plain strings) cannot carry a tool result.
                    if isinstance(content, Mapping) and content.get("type") == "tool_result":
                        result = content.get("tool_result")
                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"

    # Second pass: parse assistant messages and tool calls
    for msg in agent_response_msgs:
        if not ("role" in msg and msg.get("role") == "assistant" and "content" in msg):
            continue

        text = _extract_text_from_content(msg["content"])
        if text:
            agent_response_text.extend(text)

        if not include_tool_messages:
            continue

        for content in msg.get("content", []):
            # Todo: Verify if this is the correct way to handle tool calls
            if not isinstance(content, Mapping) or content.get("type") != "tool_call":
                continue

            if "tool_call" in content and "function" in content.get("tool_call", {}):
                # Nested shape: {"tool_call": {"id": ..., "function": {"name": ..., "arguments": ...}}}
                tc = content.get("tool_call", {})
                func_name = tc.get("function", {}).get("name", "")
                args = tc.get("function", {}).get("arguments", {})
                tool_call_id = tc.get("id")
            else:
                # Flat shape: name / arguments / tool_call_id directly on the content item.
                tool_call_id = content.get("tool_call_id")
                func_name = content.get("name", "")
                args = content.get("arguments", {})

            if isinstance(args, Mapping):
                args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
            elif args is None:
                args_str = ""
            else:
                # presumably arguments delivered as a pre-serialized string; render as-is — TODO confirm
                args_str = str(args)

            agent_response_text.append(f"[TOOL_CALL] {func_name}({args_str})")
            if tool_call_id in tool_results:
                agent_response_text.append(tool_results[tool_call_id])

    return agent_response_text
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def reformat_agent_response(response, logger=None, include_tool_messages=False):
    """Reformat an agent response into newline-joined text.

    :param response: The agent response messages; ``None`` or ``[]`` yields ``""``.
    :param logger: Optional logger; it receives a warning when a fallback is taken.
    :param include_tool_messages: Whether tool calls and results are included in the output.
    :type include_tool_messages: bool
    :return: The extracted lines joined with newlines, ``""`` for an empty response, or
        ``response`` unchanged when nothing could be extracted or parsing failed.
    """
    try:
        if response is None or response == []:
            return ""
        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
        if agent_response == []:
            # If no message could be extracted, likely the format changed, fallback to the original response in that case
            if logger:
                logger.warning(
                    f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
                )
            return response
        return "\n".join(agent_response)
    except Exception:
        # Deliberately broad best-effort fallback, but narrower than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
        if logger:
            logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
        return response
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
def reformat_tool_definitions(tool_definitions, logger=None):
    """Render tool definitions as a compact, one-line-per-tool text listing.

    :param tool_definitions: Iterable of tool definition mappings with optional
        ``name``, ``description`` and ``parameters`` (holding ``properties``) keys.
    :param logger: Optional logger; it receives a warning when the fallback is taken.
    :return: A ``TOOL_DEFINITIONS:`` header followed by one ``- name: description (inputs: ...)``
        line per tool, or ``tool_definitions`` unchanged if it could not be parsed.
    """
    try:
        output_lines = ["TOOL_DEFINITIONS:"]
        for tool in tool_definitions:
            name = tool.get("name", "unnamed_tool")
            # ``or`` also covers keys that are present but None, so a single incomplete
            # tool no longer forces the fallback for the whole list.
            desc = (tool.get("description") or "").strip()
            params = (tool.get("parameters") or {}).get("properties") or {}
            param_names = ", ".join(params) if params else "no parameters"
            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
        return "\n".join(output_lines)
    except Exception:
        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
        if logger:
            logger.warning(
                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
            )
        return tool_definitions
|
|
660
|
+
|
|
661
|
+
|
|
484
662
|
def upload(path: str, container_client: ContainerClient, logger=None):
|
|
485
663
|
"""Upload files or directories to Azure Blob Storage using a container client.
|
|
486
664
|
|
|
@@ -509,7 +687,7 @@ def upload(path: str, container_client: ContainerClient, logger=None):
|
|
|
509
687
|
local_paths = []
|
|
510
688
|
|
|
511
689
|
if os.path.isdir(path):
|
|
512
|
-
for
|
|
690
|
+
for root, _, filenames in os.walk(path):
|
|
513
691
|
upload_path = ""
|
|
514
692
|
if root != path:
|
|
515
693
|
rel_path = os.path.relpath(root, path)
|
|
@@ -81,6 +81,7 @@ class _AggregationType(enum.Enum):
|
|
|
81
81
|
SUM = "sum"
|
|
82
82
|
CUSTOM = "custom"
|
|
83
83
|
|
|
84
|
+
|
|
84
85
|
class TokenScope(str, enum.Enum):
|
|
85
86
|
"""Defines the scope of the token used to access Azure resources."""
|
|
86
87
|
|
|
@@ -114,4 +115,4 @@ BINARY_AGGREGATE_SUFFIX = "binary_aggregate"
|
|
|
114
115
|
|
|
115
116
|
AOAI_COLUMN_NAME = "aoai"
|
|
116
117
|
DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
|
|
117
|
-
DEFAULT_AOAI_API_VERSION = "2025-04-01-preview"
|
|
118
|
+
DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
|
|
@@ -718,6 +718,7 @@ class AIAgentConverter:
|
|
|
718
718
|
|
|
719
719
|
return AIAgentConverter._convert_from_conversation(data, run_id)
|
|
720
720
|
|
|
721
|
+
|
|
721
722
|
@experimental
|
|
722
723
|
class AIAgentDataRetriever:
|
|
723
724
|
# Maximum items to fetch in a single AI Services API call (imposed by the service).
|
|
@@ -748,6 +749,7 @@ class AIAgentDataRetriever:
|
|
|
748
749
|
def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
|
|
749
750
|
pass
|
|
750
751
|
|
|
752
|
+
|
|
751
753
|
@experimental
|
|
752
754
|
class LegacyAgentDataRetriever(AIAgentDataRetriever):
|
|
753
755
|
|
|
@@ -768,7 +770,8 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
|
|
|
768
770
|
after = None
|
|
769
771
|
while has_more:
|
|
770
772
|
messages = self.project_client.agents.list_messages(
|
|
771
|
-
|
|
773
|
+
thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after
|
|
774
|
+
)
|
|
772
775
|
has_more = messages.has_more
|
|
773
776
|
after = messages.last_id
|
|
774
777
|
if messages.data:
|
|
@@ -812,6 +815,7 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
|
|
|
812
815
|
def _get_run(self, thread_id: str, run_id: str):
|
|
813
816
|
return self.project_client.agents.get_run(thread_id=thread_id, run_id=run_id)
|
|
814
817
|
|
|
818
|
+
|
|
815
819
|
@experimental
|
|
816
820
|
class FDPAgentDataRetriever(AIAgentDataRetriever):
|
|
817
821
|
|
|
@@ -833,16 +837,13 @@ class FDPAgentDataRetriever(AIAgentDataRetriever):
|
|
|
833
837
|
|
|
834
838
|
def _list_run_steps_chronological(self, thread_id: str, run_id: str):
|
|
835
839
|
|
|
836
|
-
return
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
limit=self._AI_SERVICES_API_MAX_LIMIT,
|
|
840
|
-
order="asc"
|
|
841
|
-
)
|
|
840
|
+
return self.project_client.agents.run_steps.list(
|
|
841
|
+
thread_id=thread_id, run_id=run_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc"
|
|
842
|
+
)
|
|
842
843
|
|
|
843
844
|
def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
|
|
844
845
|
runs = self.project_client.agents.runs.list(thread_id=thread_id, order="asc")
|
|
845
846
|
return [run.id for run in runs]
|
|
846
847
|
|
|
847
848
|
def _get_run(self, thread_id: str, run_id: str):
|
|
848
|
-
return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
|
|
849
|
+
return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
|
|
@@ -20,6 +20,7 @@ _SYSTEM = "system"
|
|
|
20
20
|
_USER = "user"
|
|
21
21
|
_AGENT = "assistant"
|
|
22
22
|
_TOOL = "tool"
|
|
23
|
+
_DEVELOPER = "developer" # part of the semantic kernel
|
|
23
24
|
|
|
24
25
|
# Constant definitions for what tool details include.
|
|
25
26
|
_TOOL_CALL = "tool_call"
|
|
@@ -81,6 +82,7 @@ _BUILT_IN_PARAMS = {
|
|
|
81
82
|
},
|
|
82
83
|
}
|
|
83
84
|
|
|
85
|
+
|
|
84
86
|
class Message(BaseModel):
|
|
85
87
|
"""Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
|
|
86
88
|
to JSON for evaluators and we have custom fields such as createdAt, run_id, and tool_call_id, so we cannot use
|
|
@@ -123,6 +125,17 @@ class UserMessage(Message):
|
|
|
123
125
|
role: str = _USER
|
|
124
126
|
|
|
125
127
|
|
|
128
|
+
class SKDeveloperMessage(Message):
    """A developer-role message, as used with Semantic Kernel (SK) agents.

    :param role: Sender role; defaults to the module's developer role constant.
    :type role: str
    """

    # Fixed role marker for developer messages.
    role: str = _DEVELOPER
|
|
137
|
+
|
|
138
|
+
|
|
126
139
|
class ToolMessage(Message):
|
|
127
140
|
"""Represents a tool message in a conversation with agents, assistants, and tools.
|
|
128
141
|
|
|
@@ -139,6 +152,19 @@ class ToolMessage(Message):
|
|
|
139
152
|
tool_call_id: Optional[str] = None
|
|
140
153
|
|
|
141
154
|
|
|
155
|
+
class SKToolMessage(Message):
    """A tool-role message, as used with Semantic Kernel (SK) agents.

    :param role: Sender role; defaults to the module's tool role constant.
    :type role: str
    :param tool_call_id: Identifier of the tool call this message belongs to, if any.
    :type tool_call_id: Optional[str]
    """

    # Fixed role marker for tool messages.
    role: str = _TOOL
    # Left as None when the message is not tied to a specific tool call.
    tool_call_id: Optional[str] = None
|
|
166
|
+
|
|
167
|
+
|
|
142
168
|
class AssistantMessage(Message):
|
|
143
169
|
"""Represents an assistant message.
|
|
144
170
|
|
|
@@ -152,6 +178,26 @@ class AssistantMessage(Message):
|
|
|
152
178
|
role: str = _AGENT
|
|
153
179
|
|
|
154
180
|
|
|
181
|
+
class SKAssistantMessage(Message):
    """Represents an assistant message in the context of a Semantic Kernel (SK) agent.

    :param role: The role of the message sender, which is always 'assistant'.
    :type role: str
    """

    # Defined once: an identical second definition of this class followed immediately
    # and only rebound the same name.
    role: str = _AGENT
|
|
199
|
+
|
|
200
|
+
|
|
155
201
|
class ToolDefinition(BaseModel):
|
|
156
202
|
"""Represents a tool definition that will be used in the agent.
|
|
157
203
|
|