azure-ai-evaluation: azure_ai_evaluation-1.8.0-py3-none-any.whl → azure_ai_evaluation-1.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
File: azure/ai/evaluation/_common/utils.py
@@ -6,14 +6,14 @@ import posixpath
6
6
  import re
7
7
  import math
8
8
  import threading
9
- from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
9
+ from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
10
10
 
11
11
  import nltk
12
12
  from azure.storage.blob import ContainerClient
13
- from typing_extensions import NotRequired, Required, TypeGuard
13
+ from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
14
14
  from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
15
15
  from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
16
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
16
+ from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
17
17
  from azure.ai.evaluation._model_configurations import (
18
18
  AzureAIProject,
19
19
  AzureOpenAIModelConfiguration,
@@ -126,17 +126,17 @@ def construct_prompty_model_config(
126
126
 
127
127
  return prompty_model_config
128
128
 
129
- def is_onedp_project(azure_ai_project: AzureAIProject) -> bool:
129
+
130
+ def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
130
131
  """Check if the Azure AI project is an OneDP project.
131
132
 
132
133
  :param azure_ai_project: The scope of the Azure AI project.
133
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
134
+ :type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
134
135
  :return: True if the Azure AI project is an OneDP project, False otherwise.
135
136
  :rtype: bool
136
137
  """
137
- if isinstance(azure_ai_project, str):
138
- return True
139
- return False
138
+ return isinstance(azure_ai_project, str)
139
+
140
140
 
141
141
  def validate_azure_ai_project(o: object) -> AzureAIProject:
142
142
  fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
@@ -291,7 +291,8 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
291
291
 
292
292
  return cast(T_TypedDict, o)
293
293
 
294
- def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool:
294
+
295
+ def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool:
295
296
  """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].
296
297
 
297
298
  :param score: The score to check.
@@ -310,6 +311,7 @@ def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5)
310
311
 
311
312
  return min_score <= numeric_score <= max_score
312
313
 
314
+
313
315
  def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
314
316
  """Parse the output of prompt-based quality evaluators that return a score and reason.
315
317
 
@@ -481,6 +483,182 @@ def validate_conversation(conversation):
481
483
  ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
482
484
  )
483
485
 
486
+
487
+ def _extract_text_from_content(content):
488
+ text = []
489
+ for msg in content:
490
+ if "text" in msg:
491
+ text.append(msg["text"])
492
+ return text
493
+
494
+
495
+ def _get_conversation_history(query, include_system_messages=False):
496
+ all_user_queries = []
497
+ cur_user_query = []
498
+ all_agent_responses = []
499
+ cur_agent_response = []
500
+ system_message = None
501
+ for msg in query:
502
+ if not "role" in msg:
503
+ continue
504
+ if include_system_messages and msg["role"] == "system" and "content" in msg:
505
+ system_message = msg.get("content", "")
506
+ if msg["role"] == "user" and "content" in msg:
507
+ if cur_agent_response != []:
508
+ all_agent_responses.append(cur_agent_response)
509
+ cur_agent_response = []
510
+ text_in_msg = _extract_text_from_content(msg["content"])
511
+ if text_in_msg:
512
+ cur_user_query.append(text_in_msg)
513
+
514
+ if msg["role"] == "assistant" and "content" in msg:
515
+ if cur_user_query != []:
516
+ all_user_queries.append(cur_user_query)
517
+ cur_user_query = []
518
+ text_in_msg = _extract_text_from_content(msg["content"])
519
+ if text_in_msg:
520
+ cur_agent_response.append(text_in_msg)
521
+ if cur_user_query != []:
522
+ all_user_queries.append(cur_user_query)
523
+ if cur_agent_response != []:
524
+ all_agent_responses.append(cur_agent_response)
525
+
526
+ if len(all_user_queries) != len(all_agent_responses) + 1:
527
+ raise EvaluationException(
528
+ message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
529
+ internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
530
+ target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
531
+ category=ErrorCategory.INVALID_VALUE,
532
+ blame=ErrorBlame.USER_ERROR,
533
+ )
534
+ result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
535
+ if include_system_messages:
536
+ result["system_message"] = system_message
537
+ return result
538
+
539
+
540
+ def _pretty_format_conversation_history(conversation_history):
541
+ """Formats the conversation history for better readability."""
542
+ formatted_history = ""
543
+ if "system_message" in conversation_history and conversation_history["system_message"] is not None:
544
+ formatted_history += "SYSTEM_PROMPT:\n"
545
+ formatted_history += " " + conversation_history["system_message"] + "\n\n"
546
+ for i, (user_query, agent_response) in enumerate(
547
+ zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
548
+ ):
549
+ formatted_history += f"User turn {i+1}:\n"
550
+ for msg in user_query:
551
+ formatted_history += " " + "\n ".join(msg)
552
+ formatted_history += "\n\n"
553
+ if agent_response:
554
+ formatted_history += f"Agent turn {i+1}:\n"
555
+ for msg in agent_response:
556
+ formatted_history += " " + "\n ".join(msg)
557
+ formatted_history += "\n\n"
558
+ return formatted_history
559
+
560
+
561
+ def reformat_conversation_history(query, logger=None, include_system_messages=False):
562
+ """Reformats the conversation history to a more compact representation."""
563
+ try:
564
+ conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
565
+ return _pretty_format_conversation_history(conversation_history)
566
+ except:
567
+ # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
568
+ # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
569
+ # From our tests the negative impact on IntentResolution is:
570
+ # Higher intra model variance (0.142 vs 0.046)
571
+ # Higher inter model variance (0.345 vs 0.607)
572
+ # Lower percentage of mode in Likert scale (73.4% vs 75.4%)
573
+ # Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
574
+ if logger:
575
+ logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
576
+ return query
577
+
578
+
579
+ def _get_agent_response(agent_response_msgs, include_tool_messages=False):
580
+ """Extracts formatted agent response including text, and optionally tool calls/results."""
581
+ agent_response_text = []
582
+ tool_results = {}
583
+
584
+ # First pass: collect tool results
585
+ if include_tool_messages:
586
+ for msg in agent_response_msgs:
587
+ if msg.get("role") == "tool" and "tool_call_id" in msg:
588
+ for content in msg.get("content", []):
589
+ if content.get("type") == "tool_result":
590
+ result = content.get("tool_result")
591
+ tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
592
+
593
+ # Second pass: parse assistant messages and tool calls
594
+ for msg in agent_response_msgs:
595
+ if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
596
+ text = _extract_text_from_content(msg["content"])
597
+ if text:
598
+ agent_response_text.extend(text)
599
+ if include_tool_messages:
600
+ for content in msg.get("content", []):
601
+ # Todo: Verify if this is the correct way to handle tool calls
602
+ if content.get("type") == "tool_call":
603
+ if "tool_call" in content and "function" in content.get("tool_call", {}):
604
+ tc = content.get("tool_call", {})
605
+ func_name = tc.get("function", {}).get("name", "")
606
+ args = tc.get("function", {}).get("arguments", {})
607
+ tool_call_id = tc.get("id")
608
+ else:
609
+ tool_call_id = content.get("tool_call_id")
610
+ func_name = content.get("name", "")
611
+ args = content.get("arguments", {})
612
+ args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
613
+ call_line = f"[TOOL_CALL] {func_name}({args_str})"
614
+ agent_response_text.append(call_line)
615
+ if tool_call_id in tool_results:
616
+ agent_response_text.append(tool_results[tool_call_id])
617
+
618
+ return agent_response_text
619
+
620
+
621
+ def reformat_agent_response(response, logger=None, include_tool_messages=False):
622
+ try:
623
+ if response is None or response == []:
624
+ return ""
625
+ agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
626
+ if agent_response == []:
627
+ # If no message could be extracted, likely the format changed, fallback to the original response in that case
628
+ if logger:
629
+ logger.warning(
630
+ f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
631
+ )
632
+ return response
633
+ return "\n".join(agent_response)
634
+ except:
635
+ # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
636
+ # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
637
+ if logger:
638
+ logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
639
+ return response
640
+
641
+
642
+ def reformat_tool_definitions(tool_definitions, logger=None):
643
+ try:
644
+ output_lines = ["TOOL_DEFINITIONS:"]
645
+ for tool in tool_definitions:
646
+ name = tool.get("name", "unnamed_tool")
647
+ desc = tool.get("description", "").strip()
648
+ params = tool.get("parameters", {}).get("properties", {})
649
+ param_names = ", ".join(params.keys()) if params else "no parameters"
650
+ output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
651
+ return "\n".join(output_lines)
652
+ except Exception as e:
653
+ # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
654
+ # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
655
+ if logger:
656
+ logger.warning(
657
+ f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
658
+ )
659
+ return tool_definitions
660
+
661
+
484
662
  def upload(path: str, container_client: ContainerClient, logger=None):
485
663
  """Upload files or directories to Azure Blob Storage using a container client.
486
664
 
@@ -509,7 +687,7 @@ def upload(path: str, container_client: ContainerClient, logger=None):
509
687
  local_paths = []
510
688
 
511
689
  if os.path.isdir(path):
512
- for (root, _, filenames) in os.walk(path):
690
+ for root, _, filenames in os.walk(path):
513
691
  upload_path = ""
514
692
  if root != path:
515
693
  rel_path = os.path.relpath(root, path)
File: azure/ai/evaluation/_constants.py
@@ -81,6 +81,7 @@ class _AggregationType(enum.Enum):
81
81
  SUM = "sum"
82
82
  CUSTOM = "custom"
83
83
 
84
+
84
85
  class TokenScope(str, enum.Enum):
85
86
  """Defines the scope of the token used to access Azure resources."""
86
87
 
@@ -114,4 +115,4 @@ BINARY_AGGREGATE_SUFFIX = "binary_aggregate"
114
115
 
115
116
  AOAI_COLUMN_NAME = "aoai"
116
117
  DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
117
- DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
118
+ DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
File: azure/ai/evaluation/_converters/__init__.py
@@ -1,3 +1,3 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
- # ---------------------------------------------------------
3
+ # ---------------------------------------------------------
File: azure/ai/evaluation/_converters/_ai_services.py
@@ -718,6 +718,7 @@ class AIAgentConverter:
718
718
 
719
719
  return AIAgentConverter._convert_from_conversation(data, run_id)
720
720
 
721
+
721
722
  @experimental
722
723
  class AIAgentDataRetriever:
723
724
  # Maximum items to fetch in a single AI Services API call (imposed by the service).
@@ -748,6 +749,7 @@ class AIAgentDataRetriever:
748
749
  def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
749
750
  pass
750
751
 
752
+
751
753
  @experimental
752
754
  class LegacyAgentDataRetriever(AIAgentDataRetriever):
753
755
 
@@ -768,7 +770,8 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
768
770
  after = None
769
771
  while has_more:
770
772
  messages = self.project_client.agents.list_messages(
771
- thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after)
773
+ thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after
774
+ )
772
775
  has_more = messages.has_more
773
776
  after = messages.last_id
774
777
  if messages.data:
@@ -812,6 +815,7 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
812
815
  def _get_run(self, thread_id: str, run_id: str):
813
816
  return self.project_client.agents.get_run(thread_id=thread_id, run_id=run_id)
814
817
 
818
+
815
819
  @experimental
816
820
  class FDPAgentDataRetriever(AIAgentDataRetriever):
817
821
 
@@ -833,16 +837,13 @@ class FDPAgentDataRetriever(AIAgentDataRetriever):
833
837
 
834
838
  def _list_run_steps_chronological(self, thread_id: str, run_id: str):
835
839
 
836
- return self.project_client.agents.run_steps.list(
837
- thread_id=thread_id,
838
- run_id=run_id,
839
- limit=self._AI_SERVICES_API_MAX_LIMIT,
840
- order="asc"
841
- )
840
+ return self.project_client.agents.run_steps.list(
841
+ thread_id=thread_id, run_id=run_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc"
842
+ )
842
843
 
843
844
  def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
844
845
  runs = self.project_client.agents.runs.list(thread_id=thread_id, order="asc")
845
846
  return [run.id for run in runs]
846
847
 
847
848
  def _get_run(self, thread_id: str, run_id: str):
848
- return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
849
+ return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
File: azure/ai/evaluation/_converters/_models.py
@@ -20,6 +20,7 @@ _SYSTEM = "system"
20
20
  _USER = "user"
21
21
  _AGENT = "assistant"
22
22
  _TOOL = "tool"
23
+ _DEVELOPER = "developer" # part of the semantic kernel
23
24
 
24
25
  # Constant definitions for what tool details include.
25
26
  _TOOL_CALL = "tool_call"
@@ -81,6 +82,7 @@ _BUILT_IN_PARAMS = {
81
82
  },
82
83
  }
83
84
 
85
+
84
86
  class Message(BaseModel):
85
87
  """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
86
88
  to JSON for evaluators and we have custom fields such as createdAt, run_id, and tool_call_id, so we cannot use
@@ -123,6 +125,17 @@ class UserMessage(Message):
123
125
  role: str = _USER
124
126
 
125
127
 
128
+ class SKDeveloperMessage(Message):
129
+ """Represents a developer message in a conversation with agents, assistants, and tools.
130
+ This is used in the context of Semantic Kernel (SK) agents.
131
+
132
+ :param role: The role of the message sender, which is always 'developer'.
133
+ :type role: str
134
+ """
135
+
136
+ role: str = _DEVELOPER
137
+
138
+
126
139
  class ToolMessage(Message):
127
140
  """Represents a tool message in a conversation with agents, assistants, and tools.
128
141
 
@@ -139,6 +152,19 @@ class ToolMessage(Message):
139
152
  tool_call_id: Optional[str] = None
140
153
 
141
154
 
155
+ class SKToolMessage(Message):
156
+ """Represents a tool message in the context of a Semantic Kernel (SK) agent.
157
+
158
+ :param role: The role of the message sender, which is always 'tool'.
159
+ :type role: str
160
+ :param tool_call_id: The ID of the tool call associated with the message. Optional.
161
+ :type tool_call_id: Optional[str]
162
+ """
163
+
164
+ role: str = _TOOL
165
+ tool_call_id: Optional[str] = None
166
+
167
+
142
168
  class AssistantMessage(Message):
143
169
  """Represents an assistant message.
144
170
 
@@ -152,6 +178,26 @@ class AssistantMessage(Message):
152
178
  role: str = _AGENT
153
179
 
154
180
 
181
+ class SKAssistantMessage(Message):
182
+ """Represents an assistant message in the context of a Semantic Kernel (SK) agent.
183
+
184
+ :param role: The role of the message sender, which is always 'assistant'.
185
+ :type role: str
186
+ """
187
+
188
+ role: str = _AGENT
189
+
190
+
191
+ class SKAssistantMessage(Message):
192
+ """Represents an assistant message in the context of a Semantic Kernel (SK) agent.
193
+
194
+ :param role: The role of the message sender, which is always 'assistant'.
195
+ :type role: str
196
+ """
197
+
198
+ role: str = _AGENT
199
+
200
+
155
201
  class ToolDefinition(BaseModel):
156
202
  """Represents a tool definition that will be used in the agent.
157
203