uipath-2.1.108-py3-none-any.whl → uipath-2.1.109-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (69)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  19. uipath/_cli/_utils/_eval_set.py +30 -9
  20. uipath/_cli/_utils/_resources.py +21 -0
  21. uipath/_cli/_utils/_studio_project.py +18 -0
  22. uipath/_cli/cli_add.py +114 -0
  23. uipath/_cli/cli_eval.py +5 -1
  24. uipath/_cli/cli_pull.py +11 -26
  25. uipath/_cli/cli_push.py +2 -0
  26. uipath/_cli/cli_register.py +45 -0
  27. uipath/_events/_events.py +6 -5
  28. uipath/_utils/constants.py +4 -0
  29. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  30. uipath/eval/_helpers/helpers.py +30 -2
  31. uipath/eval/evaluators/__init__.py +60 -5
  32. uipath/eval/evaluators/base_evaluator.py +546 -44
  33. uipath/eval/evaluators/contains_evaluator.py +80 -0
  34. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  35. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  36. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  37. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  38. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  39. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  40. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  41. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  42. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  43. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  44. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  45. uipath/eval/evaluators/output_evaluator.py +117 -0
  46. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  47. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  48. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  49. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  50. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  51. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  52. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  53. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  54. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  55. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  56. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  57. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  58. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  59. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  60. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  61. uipath/eval/evaluators_types/generate_types.py +31 -0
  62. uipath/eval/models/__init__.py +16 -1
  63. uipath/eval/models/llm_judge_types.py +196 -0
  64. uipath/eval/models/models.py +109 -7
  65. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
  66. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/RECORD +69 -37
  67. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
  68. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
  69. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/cli_eval.py CHANGED
@@ -130,7 +130,11 @@ def eval(
 
     eval_context.no_report = no_report
     eval_context.workers = workers
-    eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
+
+    # Load eval set to resolve the path
+    eval_set_path = eval_set or EvalHelpers.auto_discover_eval_set()
+    _, resolved_eval_set_path = EvalHelpers.load_eval_set(eval_set_path, eval_ids)
+    eval_context.eval_set = resolved_eval_set_path
     eval_context.eval_ids = eval_ids
 
     console_reporter = ConsoleProgressReporter()
uipath/_cli/cli_pull.py CHANGED
@@ -24,20 +24,6 @@ from ._utils._project_files import ProjectPullError, pull_project
 console = ConsoleLogger()
 
 
-class InteractiveConflictHandler:
-    """Handler that prompts user for each conflict."""
-
-    def __init__(self, console: ConsoleLogger):
-        self.console = console
-
-    def should_overwrite(
-        self, file_path: str, local_hash: str, remote_hash: str
-    ) -> bool:
-        self.console.warning(f" File {file_path} differs from remote version.")
-        response = click.confirm("Do you want to overwrite it?", default=False)
-        return response
-
-
 @click.command()
 @click.argument(
     "root",
@@ -66,22 +52,21 @@ def pull(root: Path) -> None:
     project_id = os.getenv(UIPATH_PROJECT_ID)
     if not project_id:
         console.error("UIPATH_PROJECT_ID environment variable not found.")
+        return
 
-    default_download_configuration = {
+    download_configuration = {
         "source_code": root,
         "evals": root / "evals",
     }
 
-    async def pull_with_updates():
-        try:
-            async for update in pull_project(
-                project_id,
-                default_download_configuration,
-                InteractiveConflictHandler(console),
-            ):
+    try:
+
+        async def run_pull():
+            async for update in pull_project(project_id, download_configuration):
+                console.info(f"Processing: {update.file_path}")
                 console.info(update.message)
-        except ProjectPullError as e:
-            console.error(e.message, include_traceback=True)
 
-    with console.spinner("Pulling UiPath project files..."):
-        asyncio.run(pull_with_updates())
+        asyncio.run(run_pull())
+        console.success("Project pulled successfully")
+    except ProjectPullError as e:
+        console.error(f"Failed to pull UiPath project: {str(e)}")
uipath/_cli/cli_push.py CHANGED
@@ -61,6 +61,8 @@ async def upload_source_files_to_project(
     async for update in sw_file_handler.upload_source_files(settings):
         yield update
 
+    await sw_file_handler.upload_coded_evals_files()
+
 
 @click.command()
 @click.argument(
uipath/_cli/cli_register.py ADDED
@@ -0,0 +1,45 @@
+# type: ignore
+import logging
+
+import click
+
+from ..telemetry import track
+from ._evals._helpers import register_evaluator
+from ._utils._console import ConsoleLogger
+from ._utils._resources import Resources
+
+logger = logging.getLogger(__name__)
+console = ConsoleLogger()
+
+
+@click.command()
+@click.argument("resource", required=True)
+@click.argument("args", nargs=-1)
+@track
+def register(resource: str, args: tuple[str]) -> None:
+    """Register a local resource.
+
+    Examples:
+        uipath register evaluator my-custom-evaluator.py
+    """
+    match Resources.from_string(resource):
+        case Resources.EVALUATOR:
+            usage_hint = f"Usage: {click.style('uipath register evaluator <evaluator_file_name> (ex. my_custom_evaluator.py)', fg='cyan')}"
+            if len(args) < 1:
+                console.hint(usage_hint)
+                console.error("Missing required argument: evaluator_file_name.")
+                return
+            if len(args) > 1:
+                console.hint(usage_hint)
+                console.error(
+                    f"Too many arguments provided: {args}. Expected only evaluator_file_name (ex. my_custom_evaluator.py)"
+                )
+
+            filename = args[0]
+
+            if not isinstance(filename, str) or not filename.strip():
+                console.hint(usage_hint)
+                console.error("Invalid filename: must be a non-empty string")
+                return
+
+            register_evaluator(filename)
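The register flow points the CLI at a local Python file that defines a custom evaluator; this release also ships a custom_evaluator.py.template and a reworked base_evaluator.py that define the real contract. As a purely hypothetical, self-contained sketch of the general shape such a file takes (the class name, method name, and result type below are illustrative assumptions, not the package's actual API):

# my_custom_evaluator.py -- hypothetical illustration only; the real contract
# is defined by uipath/eval/evaluators/base_evaluator.py and the bundled
# custom_evaluator.py.template, not by this sketch.
from dataclasses import dataclass


@dataclass
class SketchResult:  # illustrative stand-in for the package's score model
    score: float
    justification: str


class MyCustomEvaluator:  # a real evaluator would subclass the package's base class
    def evaluate(self, expected_output: str, actual_output: str) -> SketchResult:
        # Toy scoring rule: case-insensitive exact match on the output strings.
        matched = expected_output.strip().lower() == actual_output.strip().lower()
        return SketchResult(
            score=1.0 if matched else 0.0,
            justification="case-insensitive match" if matched else "outputs differ",
        )

Once such a file exists locally, it is registered with uipath register evaluator my_custom_evaluator.py, which hands the filename to register_evaluator.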
uipath/_events/_events.py CHANGED
@@ -3,9 +3,9 @@ from enum import Enum
 from typing import Any, Dict, List, Optional, Union
 
 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field, SkipValidation, model_validator
 
-from uipath._cli._evals._models._evaluation_set import EvaluationItem
+from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem, AnyEvaluator
 from uipath.eval.models import EvalItemResult
 
 
@@ -21,12 +21,13 @@ class EvalSetRunCreatedEvent(BaseModel):
     entrypoint: str
     eval_set_id: str
     no_of_evals: int
-    evaluators: List[Any]
+    # skip validation to avoid abstract class instantiation
+    evaluators: SkipValidation[List[AnyEvaluator]]
 
 
 class EvalRunCreatedEvent(BaseModel):
     execution_id: str
-    eval_item: EvaluationItem
+    eval_item: AnyEvaluationItem
 
 
 class EvalItemExceptionDetails(BaseModel):
@@ -40,7 +41,7 @@ class EvalRunUpdatedEvent(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     execution_id: str
-    eval_item: EvaluationItem
+    eval_item: AnyEvaluationItem
     eval_results: List[EvalItemResult]
     success: bool
     agent_output: Any
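The SkipValidation change is the standard pydantic v2 escape hatch for a field whose annotation is an abstract base: per the comment in the diff, without it pydantic may try to coerce incoming data into the abstract class, which fails at instantiation time, whereas with it the items are stored as given. A minimal, self-contained sketch of the pattern (illustrative models, not the ones from uipath._events):

# Sketch of the SkipValidation pattern used above.
from abc import abstractmethod
from typing import List

from pydantic import BaseModel, SkipValidation


class AbstractEvaluator(BaseModel):
    id: str

    @abstractmethod
    def score(self) -> float: ...


class ExactMatch(AbstractEvaluator):
    def score(self) -> float:
        return 1.0


class RunCreated(BaseModel):
    # Items are accepted exactly as given; pydantic does not try to coerce or
    # re-validate them into AbstractEvaluator, which cannot be instantiated.
    evaluators: SkipValidation[List[AbstractEvaluator]]


event = RunCreated(evaluators=[ExactMatch(id="e1")])
print(type(event.evaluators[0]).__name__)  # ExactMatch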
uipath/_utils/constants.py CHANGED
@@ -1,6 +1,7 @@
 # Environment variables
 DOTENV_FILE = ".env"
 ENV_BASE_URL = "UIPATH_URL"
+ENV_EVAL_BACKEND_URL = "UIPATH_EVAL_BACKEND_URL"
 ENV_UNATTENDED_USER_ACCESS_TOKEN = "UNATTENDED_USER_ACCESS_TOKEN"
 ENV_UIPATH_ACCESS_TOKEN = "UIPATH_ACCESS_TOKEN"
 ENV_FOLDER_KEY = "UIPATH_FOLDER_KEY"
@@ -46,3 +47,6 @@ COMMUNITY_agents_SUFFIX = "-community-agents"
 
 # File names
 UIPATH_CONFIG_FILE = "uipath.json"
+
+# Evaluators
+CUSTOM_EVALUATOR_PREFIX = "file://"
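The new CUSTOM_EVALUATOR_PREFIX suggests that evaluator references can now point at local files. A purely hypothetical sketch of how a "file://" reference might be detected and resolved to a path (the helper names below are illustrative and not part of the package; the actual resolution logic lives in the new _evals helpers and the evaluator factory):

# Hypothetical helpers around the new constant; names are illustrative only.
CUSTOM_EVALUATOR_PREFIX = "file://"


def is_custom_evaluator_ref(ref: str) -> bool:
    # e.g. "file://my_custom_evaluator.py" -> True
    return ref.startswith(CUSTOM_EVALUATOR_PREFIX)


def custom_evaluator_path(ref: str) -> str:
    # e.g. "file://my_custom_evaluator.py" -> "my_custom_evaluator.py"
    return ref.removeprefix(CUSTOM_EVALUATOR_PREFIX)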
uipath/eval/_helpers/evaluators_helpers.py ADDED
@@ -0,0 +1,494 @@
+import ast
+import json
+from collections.abc import Mapping, Sequence
+from datetime import datetime
+from typing import Any
+
+from opentelemetry.sdk.trace import ReadableSpan
+
+from ..models import (
+    ToolCall,
+    ToolOutput,
+)
+
+COMPARATOR_MAPPINGS = {
+    ">": "gt",
+    "<": "lt",
+    ">=": "ge",
+    "<=": "le",
+    "=": "eq",
+    "==": "eq",
+    "!=": "ne",
+}
+
+COMMUNITY_agents_SUFFIX = "-community-agents"
+
+
+def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]:
+    """Extract the tool call names from execution spans IN ORDER.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        List of tool names in the order they were called.
+    """
+    tool_calls_names = []
+
+    for span in spans:
+        # Check for tool.name attribute first
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            tool_calls_names.append(str(tool_name))
+
+    return tool_calls_names
+
+
+def extract_tool_calls(spans: Sequence[ReadableSpan]) -> list[ToolCall]:
+    """Extract the tool calls from execution spans with their arguments.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        Dict of tool calls with their arguments.
+    """
+    tool_calls = []
+
+    for span in spans:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            try:
+                input_value: Any = span.attributes.get("input.value", {})
+                # Ensure input_value is a string before parsing
+                if isinstance(input_value, str):
+                    arguments = ast.literal_eval(input_value)
+                elif isinstance(input_value, dict):
+                    arguments = input_value
+                else:
+                    arguments = {}
+                tool_calls.append(ToolCall(name=str(tool_name), args=arguments))
+            except (json.JSONDecodeError, SyntaxError, ValueError):
+                # Handle case where input.value is not valid JSON/Python syntax
+                tool_calls.append(ToolCall(name=str(tool_name), args={}))
+
+    return tool_calls
+
+
+def extract_tool_calls_outputs(spans: Sequence[ReadableSpan]) -> list[ToolOutput]:
+    """Extract the outputs of the tool calls from execution spans.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        List of tool calls outputs.
+    """
+    # After span normalization, the output.value should always be a dict with a content field
+    # We keep this list of potential output keys for extensibility purposes (e.g. frameworks without span normalization)
+    potential_output_keys = ["content"]
+    tool_calls_outputs = []
+    for span in spans:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            output = span.attributes.get("output.value", "")
+            final_output = ""
+
+            # Handle different output formats
+            if isinstance(output, str):
+                try:
+                    # Try to parse as JSON and extract content field
+                    parsed_output = json.loads(output)
+                    if isinstance(parsed_output, dict):
+                        for key in potential_output_keys:
+                            if key in parsed_output:
+                                final_output = parsed_output[key]
+                                break
+                    else:
+                        # If parsed JSON is not a dict, use the original string
+                        final_output = output
+                except (json.JSONDecodeError, ValueError):
+                    # If parsing fails, use the string as-is
+                    final_output = output
+            elif isinstance(output, dict):
+                # If output is already a dict, extract content field
+                for key in potential_output_keys:
+                    if key in output:
+                        final_output = output.get(key, "")
+                        break
+            else:
+                final_output = str(output)
+
+            tool_calls_outputs.append(
+                ToolOutput(
+                    name=str(tool_name),
+                    output=str(final_output) if final_output else "",
+                )
+            )
+    return tool_calls_outputs
+
+
+def tool_calls_order_score(
+    actual_tool_calls_names: Sequence[str],
+    expected_tool_calls_names: Sequence[str],
+    strict: bool = False,
+) -> tuple[float, dict[str, Any]]:
+    """The function calculates a score based on LCS applied to the order of the tool calls.
+
+    It calculates the longest common subsequence between the actual tool calls
+    and the expected tool calls and returns the ratio of the LCS length to the number of
+    expected calls.
+
+    Args:
+        actual_tool_calls_names: List of tool names in the actual order
+        expected_tool_calls_names: List of tool names in the expected order
+        strict: If True, the function will return 0 if the actual calls do not match the expected calls exactly
+
+    Returns:
+        tuple[float, dict]: Ratio of the LCS length to the number of expected, and the justification dict
+    """
+    justification = {
+        "actual_tool_calls_order": list(actual_tool_calls_names),
+        "expected_tool_calls_order": list(expected_tool_calls_names),
+        "lcs": [],
+    }
+
+    # Handle empty cases
+    if not expected_tool_calls_names and not actual_tool_calls_names:
+        return 1.0, justification
+    elif not expected_tool_calls_names or not actual_tool_calls_names:
+        return 0.0, justification
+
+    # Handle exact match
+    if expected_tool_calls_names == actual_tool_calls_names:
+        justification["lcs"] = list(actual_tool_calls_names)
+        return 1.0, justification
+
+    # Handle strict mode - only perfect matches allowed
+    if strict:
+        return 0.0, justification
+
+    # Calculate LCS with full DP table for efficient reconstruction
+    m, n = len(actual_tool_calls_names), len(expected_tool_calls_names)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+    # Build DP table - O(m*n)
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if actual_tool_calls_names[i - 1] == expected_tool_calls_names[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+            else:
+                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+    # Reconstruct LCS - O(m+n)
+    lcs = []
+    i, j = m, n
+    while i > 0 and j > 0:
+        if actual_tool_calls_names[i - 1] == expected_tool_calls_names[j - 1]:
+            lcs.append(actual_tool_calls_names[i - 1])
+            i -= 1
+            j -= 1
+        elif dp[i - 1][j] > dp[i][j - 1]:
+            i -= 1
+        else:
+            j -= 1
+
+    lcs.reverse()  # Reverse to get correct order
+    lcs_length = len(lcs)
+    justification["lcs"] = lcs
+    return lcs_length / n, justification
+
+
+def tool_calls_count_score(
+    actual_tool_calls_count: Mapping[str, int],
+    expected_tool_calls_count: Mapping[str, tuple[str, int]],
+    strict: bool = False,
+    justification_key: str = "explained_tool_calls_count",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool call counts match the actual tool call counts.
+
+    Args:
+        actual_tool_calls_count: Mapping of tool names to their actual call counts.
+        expected_tool_calls_count: Mapping of tool names to expected (comparator, count) tuples.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+        justification_key: Key to use for the justification in the returned dict.
+
+    Returns:
+        tuple[float, dict]: Score based on the number of matches, and the justification dict.
+    """
+    if not expected_tool_calls_count and not actual_tool_calls_count:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls are empty"
+            }
+        }
+    elif not expected_tool_calls_count or not actual_tool_calls_count:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls are empty"
+            }
+        }
+
+    score = 0.0
+    justifications: dict[str, Any] = {justification_key: {}}
+    for tool_name, (
+        expected_comparator,
+        expected_count,
+    ) in expected_tool_calls_count.items():
+        actual_count = actual_tool_calls_count.get(tool_name, 0.0)
+        comparator = f"__{COMPARATOR_MAPPINGS[expected_comparator]}__"
+        to_add = float(getattr(actual_count, comparator)(expected_count))
+
+        justifications[justification_key][tool_name] = (
+            f"Actual: {actual_count}, Expected: {expected_count}, Score: {to_add}"
+        )
+        if strict and to_add == 0.0:
+            # When strict is True, if the actual count does not match the expected count, return 0
+            # The justification should only include the breaching tool name
+            return 0.0, {
+                justification_key: {
+                    tool_name: justifications[justification_key][tool_name]
+                }
+            }
+        score += to_add
+    return score / len(expected_tool_calls_count), justifications
+
+
+def tool_calls_args_score(
+    actual_tool_calls: list[ToolCall],
+    expected_tool_calls: list[ToolCall],
+    strict: bool = False,
+    subset: bool = False,
+    justification_key: str = "explained_tool_calls_args",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool calls are correctly called with matching arguments.
+
+    This function does not check the order of the tool calls!
+
+    Args:
+        actual_tool_calls: List of actual tool calls with their arguments.
+        expected_tool_calls: List of expected tool calls with their arguments.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+        subset: If True, the function will check if the expected args are a subset of the actual args.
+        justification_key: Key to use for the justification in the returned dict.
+
+    Returns:
+        tuple[float, dict]: Score based on the number of matches, and the justification dict.
+    """
+    if not expected_tool_calls and not actual_tool_calls:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls are empty"
+            }
+        }
+    elif not expected_tool_calls or not actual_tool_calls:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls are empty"
+            }
+        }
+
+    cnt = 0
+    visited: set[int] = set()
+    justifications: dict[str, Any] = {justification_key: {}}
+    tool_counters: dict[str, int] = {}
+
+    for expected_tool_call in expected_tool_calls:
+        for idx, call in enumerate(actual_tool_calls):
+            if call.name == expected_tool_call.name and idx not in visited:
+                # Get or initialize counter for this tool name
+                tool_counters[call.name] = tool_counters.get(call.name, 0)
+                tool_key = f"{call.name}_{tool_counters[call.name]}"
+                tool_counters[call.name] += 1
+
+                # Check arguments based on mode
+                # The linter highlights a few problems here due to using lambdas, but they're safe to ignore
+                # Breaking this down into proper functions would unnecessarily make the code more complex
+                if subset:
+                    # Subset mode: safely check if all expected args exist and match
+                    args_check = (  # noqa: E731
+                        lambda k, v: k in call.args  # noqa: B023
+                        and call.args[k] == v  # noqa: B023
+                    )
+                else:
+                    # Exact mode: direct access (may raise KeyError)
+                    args_check = lambda k, v: call.args[k] == v  # noqa: E731, B023
+
+                try:
+                    args_match = all(
+                        args_check(k, v) for k, v in expected_tool_call.args.items()
+                    )
+                except KeyError:
+                    # Only possible in exact mode when key is missing
+                    args_match = False
+
+                justifications[justification_key][tool_key] = (
+                    f"Actual: {call.args}, Expected: {expected_tool_call.args}, Score: {float(args_match)}"
+                )
+                if args_match:
+                    cnt += 1
+                    visited.add(idx)
+                    break
+                # In case of mismatch, DON'T add to visited in non-strict mode
+                # so this actual tool call can be matched against other expected calls
+
+    return (
+        cnt / len(expected_tool_calls)
+        if not strict
+        else float(cnt == len(expected_tool_calls))
+    ), justifications
+
+
+def tool_calls_output_score(
+    actual_tool_calls_outputs: list[ToolOutput],
+    expected_tool_calls_outputs: list[ToolOutput],
+    strict: bool = False,
+    justification_key: str = "explained_tool_calls_outputs",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool calls are correctly called, where expected args must be a subset of actual args.
+
+    Args:
+        actual_tool_calls_outputs: List of actual tool calls outputs.
+        expected_tool_calls_outputs: List of expected tool calls outputs.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+
+    Returns:
+        tuple[float, str]: Score based on the number of matches, and the justification.
+    """
+    if not expected_tool_calls_outputs and not actual_tool_calls_outputs:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls outputs are empty"
+            }
+        }
+    elif not expected_tool_calls_outputs or not actual_tool_calls_outputs:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls outputs are empty"
+            }
+        }
+
+    cnt = 0.0
+    justifications: dict[str, Any] = {justification_key: {}}
+    visited: set[int] = set()
+    tool_counters: dict[str, int] = {}
+
+    for expected_tool_call_output in expected_tool_calls_outputs:
+        matched = False
+
+        # Look through ALL actual tool calls to find a match
+        for idx, actual_tool_call_output in enumerate(actual_tool_calls_outputs):
+            if idx in visited:
+                continue
+            if actual_tool_call_output.name == expected_tool_call_output.name:
+                # Get or initialize counter for this tool name
+                tool_counters[actual_tool_call_output.name] = tool_counters.get(
+                    actual_tool_call_output.name, 0
+                )
+                tool_key = f"{actual_tool_call_output.name}_{tool_counters[actual_tool_call_output.name]}"
+                tool_counters[actual_tool_call_output.name] += 1
+
+                justifications[justification_key][tool_key] = (
+                    f"Actual: {actual_tool_call_output.output}, Expected: {expected_tool_call_output.output}, Score: {float(actual_tool_call_output.output == expected_tool_call_output.output)}"
+                )
+
+                if actual_tool_call_output.output == expected_tool_call_output.output:
+                    # Perfect match found
+                    cnt += 1.0
+                    visited.add(idx)
+                    matched = True
+                    break
+                elif strict:
+                    # In strict mode, any mismatch returns 0 immediately
+                    return 0.0, {
+                        justification_key: {
+                            tool_key: justifications[justification_key][tool_key]
+                        }
+                    }
+                # In non-strict mode with mismatch, continue looking for perfect match
+                # DON'T add to visited, DON'T break
+
+        # If no match found and we're in strict mode, return 0
+        if not matched and strict:
+            return 0.0, {
+                justification_key: {
+                    "_result": f"No matching actual tool call found for expected {expected_tool_call_output.name}"
+                }
+            }
+
+    return (
+        cnt / len(expected_tool_calls_outputs)
+        if not strict
+        else float(cnt == len(expected_tool_calls_outputs))
+    ), justifications
+
+
+def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str:
+    """Convert OTEL spans to a platform-style agent run history string.
+
+    Creates a similar structure to LangChain message processing but using OTEL spans.
+    Only processes tool spans (spans with 'tool.name' attribute).
+
+    Args:
+        agent_trace: List of ReadableSpan objects from the agent execution
+
+    Returns:
+        String representation of the agent run history in platform format
+    """
+    platform_history = []
+    seen_tool_calls = set()
+
+    for span in agent_trace:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            # Get span timing information
+            start_time = span.start_time
+            end_time = span.end_time
+
+            # Convert nanoseconds to datetime if needed
+            if isinstance(start_time, int):
+                start_timestamp = datetime.fromtimestamp(start_time / 1e9)
+            else:
+                start_timestamp = start_time  # type:ignore
+
+            if isinstance(end_time, int):
+                end_timestamp = datetime.fromtimestamp(end_time / 1e9)
+            else:
+                end_timestamp = end_time  # type:ignore
+
+            timestamp_str = (
+                start_timestamp.strftime("%Y-%m-%d %H:%M:%S") if start_timestamp else ""
+            )
+
+            # Get tool call information
+            tool_args: Any = span.attributes.get("input.value", {})
+            tool_result = str(span.attributes.get("output.value", {})).strip()
+
+            span_id = (
+                span.context.span_id
+                if span.context
+                else str(hash(f"{tool_name}_{timestamp_str}"))
+            )
+
+            # De-duplicate tool calls based on span ID
+            if span_id in seen_tool_calls:
+                continue
+            seen_tool_calls.add(span_id)
+
+            # Add tool selection (equivalent to AIMessage with tool_calls)
+            platform_history.append(f"[{timestamp_str}] LLM Response:")
+            platform_history.append(" Agent Selected 1 Tool(s):")
+            platform_history.append("")
+            platform_history.append(f" Tool: {tool_name}")
+            platform_history.append(f" Arguments: {str(tool_args)}")
+            platform_history.append("")
+
+            # Add tool response (equivalent to ToolMessage)
+            end_timestamp_str = (
+                end_timestamp.strftime("%Y-%m-%d %H:%M:%S")
+                if end_timestamp
+                else timestamp_str
+            )
+            platform_history.append(
+                f"[{end_timestamp_str}] Tool Call Response - {tool_name}:"
+            )
+            platform_history.append(f"{tool_result}")
+            platform_history.append("")
+
+    return "\n".join(platform_history)
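To make the scoring semantics above concrete, here is a small usage sketch of two of the new helpers (it assumes uipath 2.1.109 is installed and that these private helpers keep their current import path):

# Usage sketch for the tool-call scoring helpers shown above (uipath 2.1.109;
# the module is private, so the import path may change in later releases).
from uipath.eval._helpers.evaluators_helpers import (
    tool_calls_count_score,
    tool_calls_order_score,
)

# Order: the LCS of ["summarize", "search"] against ["search", "summarize"]
# is ["search"], so the score is 1 / len(expected) = 0.5.
order_score, order_why = tool_calls_order_score(
    actual_tool_calls_names=["summarize", "search"],
    expected_tool_calls_names=["search", "summarize"],
)
print(order_score, order_why["lcs"])  # 0.5 ['search']

# Count: "search" >= 2 passes (3 actual calls), "fetch" == 2 fails (1 call),
# so 1 of 2 checks pass and the score is 0.5.
count_score, count_why = tool_calls_count_score(
    actual_tool_calls_count={"search": 3, "fetch": 1},
    expected_tool_calls_count={"search": (">=", 2), "fetch": ("==", 2)},
)
print(count_score)  # 0.5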