uipath 2.1.107-py3-none-any.whl → 2.1.109-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of uipath might be problematic.

Files changed (72)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_runtime/_contracts.py +25 -5
  19. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  20. uipath/_cli/_utils/_eval_set.py +30 -9
  21. uipath/_cli/_utils/_resources.py +21 -0
  22. uipath/_cli/_utils/_studio_project.py +18 -0
  23. uipath/_cli/cli_add.py +114 -0
  24. uipath/_cli/cli_eval.py +5 -1
  25. uipath/_cli/cli_pull.py +11 -26
  26. uipath/_cli/cli_push.py +2 -0
  27. uipath/_cli/cli_register.py +45 -0
  28. uipath/_events/_events.py +6 -5
  29. uipath/_resources/SDK_REFERENCE.md +0 -97
  30. uipath/_uipath.py +10 -37
  31. uipath/_utils/constants.py +4 -0
  32. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  33. uipath/eval/_helpers/helpers.py +30 -2
  34. uipath/eval/evaluators/__init__.py +60 -5
  35. uipath/eval/evaluators/base_evaluator.py +546 -44
  36. uipath/eval/evaluators/contains_evaluator.py +80 -0
  37. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  38. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  39. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  40. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  41. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  42. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  43. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  44. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  45. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  46. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  47. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  48. uipath/eval/evaluators/output_evaluator.py +117 -0
  49. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  50. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  51. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  52. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  53. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  54. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  55. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  56. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  57. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  58. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  59. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  60. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  61. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  62. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  63. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  64. uipath/eval/evaluators_types/generate_types.py +31 -0
  65. uipath/eval/models/__init__.py +16 -1
  66. uipath/eval/models/llm_judge_types.py +196 -0
  67. uipath/eval/models/models.py +109 -7
  68. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
  69. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
  70. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
  71. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
  72. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/eval/_helpers/evaluators_helpers.py (new file)
@@ -0,0 +1,494 @@
+import ast
+import json
+from collections.abc import Mapping, Sequence
+from datetime import datetime
+from typing import Any
+
+from opentelemetry.sdk.trace import ReadableSpan
+
+from ..models import (
+    ToolCall,
+    ToolOutput,
+)
+
+COMPARATOR_MAPPINGS = {
+    ">": "gt",
+    "<": "lt",
+    ">=": "ge",
+    "<=": "le",
+    "=": "eq",
+    "==": "eq",
+    "!=": "ne",
+}
+
+COMMUNITY_agents_SUFFIX = "-community-agents"
+
+
+def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]:
+    """Extract the tool call names from execution spans IN ORDER.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        List of tool names in the order they were called.
+    """
+    tool_calls_names = []
+
+    for span in spans:
+        # Check for tool.name attribute first
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            tool_calls_names.append(str(tool_name))
+
+    return tool_calls_names
+
+
+def extract_tool_calls(spans: Sequence[ReadableSpan]) -> list[ToolCall]:
+    """Extract the tool calls from execution spans with their arguments.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        Dict of tool calls with their arguments.
+    """
+    tool_calls = []
+
+    for span in spans:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            try:
+                input_value: Any = span.attributes.get("input.value", {})
+                # Ensure input_value is a string before parsing
+                if isinstance(input_value, str):
+                    arguments = ast.literal_eval(input_value)
+                elif isinstance(input_value, dict):
+                    arguments = input_value
+                else:
+                    arguments = {}
+                tool_calls.append(ToolCall(name=str(tool_name), args=arguments))
+            except (json.JSONDecodeError, SyntaxError, ValueError):
+                # Handle case where input.value is not valid JSON/Python syntax
+                tool_calls.append(ToolCall(name=str(tool_name), args={}))
+
+    return tool_calls
+
+
+def extract_tool_calls_outputs(spans: Sequence[ReadableSpan]) -> list[ToolOutput]:
+    """Extract the outputs of the tool calls from execution spans.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        List of tool calls outputs.
+    """
+    # After span normalization, the output.value should always be a dict with a content field
+    # We keep this list of potential output keys for extensibility purposes (e.g. frameworks without span normalization)
+    potential_output_keys = ["content"]
+    tool_calls_outputs = []
+    for span in spans:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            output = span.attributes.get("output.value", "")
+            final_output = ""
+
+            # Handle different output formats
+            if isinstance(output, str):
+                try:
+                    # Try to parse as JSON and extract content field
+                    parsed_output = json.loads(output)
+                    if isinstance(parsed_output, dict):
+                        for key in potential_output_keys:
+                            if key in parsed_output:
+                                final_output = parsed_output[key]
+                                break
+                    else:
+                        # If parsed JSON is not a dict, use the original string
+                        final_output = output
+                except (json.JSONDecodeError, ValueError):
+                    # If parsing fails, use the string as-is
+                    final_output = output
+            elif isinstance(output, dict):
+                # If output is already a dict, extract content field
+                for key in potential_output_keys:
+                    if key in output:
+                        final_output = output.get(key, "")
+                        break
+            else:
+                final_output = str(output)
+
+            tool_calls_outputs.append(
+                ToolOutput(
+                    name=str(tool_name),
+                    output=str(final_output) if final_output else "",
+                )
+            )
+    return tool_calls_outputs
+
+
+def tool_calls_order_score(
+    actual_tool_calls_names: Sequence[str],
+    expected_tool_calls_names: Sequence[str],
+    strict: bool = False,
+) -> tuple[float, dict[str, Any]]:
+    """The function calculates a score based on LCS applied to the order of the tool calls.
+
+    It calculates the longest common subsequence between the actual tool calls
+    and the expected tool calls and returns the ratio of the LCS length to the number of
+    expected calls.
+
+    Args:
+        actual_tool_calls_names: List of tool names in the actual order
+        expected_tool_calls_names: List of tool names in the expected order
+        strict: If True, the function will return 0 if the actual calls do not match the expected calls exactly
+
+    Returns:
+        tuple[float, dict]: Ratio of the LCS length to the number of expected, and the justification dict
+    """
+    justification = {
+        "actual_tool_calls_order": list(actual_tool_calls_names),
+        "expected_tool_calls_order": list(expected_tool_calls_names),
+        "lcs": [],
+    }
+
+    # Handle empty cases
+    if not expected_tool_calls_names and not actual_tool_calls_names:
+        return 1.0, justification
+    elif not expected_tool_calls_names or not actual_tool_calls_names:
+        return 0.0, justification
+
+    # Handle exact match
+    if expected_tool_calls_names == actual_tool_calls_names:
+        justification["lcs"] = list(actual_tool_calls_names)
+        return 1.0, justification
+
+    # Handle strict mode - only perfect matches allowed
+    if strict:
+        return 0.0, justification
+
+    # Calculate LCS with full DP table for efficient reconstruction
+    m, n = len(actual_tool_calls_names), len(expected_tool_calls_names)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+    # Build DP table - O(m*n)
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if actual_tool_calls_names[i - 1] == expected_tool_calls_names[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+            else:
+                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+    # Reconstruct LCS - O(m+n)
+    lcs = []
+    i, j = m, n
+    while i > 0 and j > 0:
+        if actual_tool_calls_names[i - 1] == expected_tool_calls_names[j - 1]:
+            lcs.append(actual_tool_calls_names[i - 1])
+            i -= 1
+            j -= 1
+        elif dp[i - 1][j] > dp[i][j - 1]:
+            i -= 1
+        else:
+            j -= 1
+
+    lcs.reverse()  # Reverse to get correct order
+    lcs_length = len(lcs)
+    justification["lcs"] = lcs
+    return lcs_length / n, justification
+
+
+def tool_calls_count_score(
+    actual_tool_calls_count: Mapping[str, int],
+    expected_tool_calls_count: Mapping[str, tuple[str, int]],
+    strict: bool = False,
+    justification_key: str = "explained_tool_calls_count",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool call counts match the actual tool call counts.
+
+    Args:
+        actual_tool_calls_count: Mapping of tool names to their actual call counts.
+        expected_tool_calls_count: Mapping of tool names to expected (comparator, count) tuples.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+        justification_key: Key to use for the justification in the returned dict.
+
+    Returns:
+        tuple[float, dict]: Score based on the number of matches, and the justification dict.
+    """
+    if not expected_tool_calls_count and not actual_tool_calls_count:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls are empty"
+            }
+        }
+    elif not expected_tool_calls_count or not actual_tool_calls_count:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls are empty"
+            }
+        }
+
+    score = 0.0
+    justifications: dict[str, Any] = {justification_key: {}}
+    for tool_name, (
+        expected_comparator,
+        expected_count,
+    ) in expected_tool_calls_count.items():
+        actual_count = actual_tool_calls_count.get(tool_name, 0.0)
+        comparator = f"__{COMPARATOR_MAPPINGS[expected_comparator]}__"
+        to_add = float(getattr(actual_count, comparator)(expected_count))
+
+        justifications[justification_key][tool_name] = (
+            f"Actual: {actual_count}, Expected: {expected_count}, Score: {to_add}"
+        )
+        if strict and to_add == 0.0:
+            # When strict is True, if the actual count does not match the expected count, return 0
+            # The justification should only include the breaching tool name
+            return 0.0, {
+                justification_key: {
+                    tool_name: justifications[justification_key][tool_name]
+                }
+            }
+        score += to_add
+    return score / len(expected_tool_calls_count), justifications
+
+
+def tool_calls_args_score(
+    actual_tool_calls: list[ToolCall],
+    expected_tool_calls: list[ToolCall],
+    strict: bool = False,
+    subset: bool = False,
+    justification_key: str = "explained_tool_calls_args",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool calls are correctly called with matching arguments.
+
+    This function does not check the order of the tool calls!
+
+    Args:
+        actual_tool_calls: List of actual tool calls with their arguments.
+        expected_tool_calls: List of expected tool calls with their arguments.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+        subset: If True, the function will check if the expected args are a subset of the actual args.
+        justification_key: Key to use for the justification in the returned dict.
+
+    Returns:
+        tuple[float, dict]: Score based on the number of matches, and the justification dict.
+    """
+    if not expected_tool_calls and not actual_tool_calls:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls are empty"
+            }
+        }
+    elif not expected_tool_calls or not actual_tool_calls:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls are empty"
+            }
+        }
+
+    cnt = 0
+    visited: set[int] = set()
+    justifications: dict[str, Any] = {justification_key: {}}
+    tool_counters: dict[str, int] = {}
+
+    for expected_tool_call in expected_tool_calls:
+        for idx, call in enumerate(actual_tool_calls):
+            if call.name == expected_tool_call.name and idx not in visited:
+                # Get or initialize counter for this tool name
+                tool_counters[call.name] = tool_counters.get(call.name, 0)
+                tool_key = f"{call.name}_{tool_counters[call.name]}"
+                tool_counters[call.name] += 1
+
+                # Check arguments based on mode
+                # The linter highlights a few problems here due to using lambdas, but they're safe to ignore
+                # Breaking this down into proper functions would unnecessarily make the code more complex
+                if subset:
+                    # Subset mode: safely check if all expected args exist and match
+                    args_check = (  # noqa: E731
+                        lambda k, v: k in call.args  # noqa: B023
+                        and call.args[k] == v  # noqa: B023
+                    )
+                else:
+                    # Exact mode: direct access (may raise KeyError)
+                    args_check = lambda k, v: call.args[k] == v  # noqa: E731, B023
+
+                try:
+                    args_match = all(
+                        args_check(k, v) for k, v in expected_tool_call.args.items()
+                    )
+                except KeyError:
+                    # Only possible in exact mode when key is missing
+                    args_match = False
+
+                justifications[justification_key][tool_key] = (
+                    f"Actual: {call.args}, Expected: {expected_tool_call.args}, Score: {float(args_match)}"
+                )
+                if args_match:
+                    cnt += 1
+                    visited.add(idx)
+                    break
+                # In case of mismatch, DON'T add to visited in non-strict mode
+                # so this actual tool call can be matched against other expected calls
+
+    return (
+        cnt / len(expected_tool_calls)
+        if not strict
+        else float(cnt == len(expected_tool_calls))
+    ), justifications
+
+
+def tool_calls_output_score(
+    actual_tool_calls_outputs: list[ToolOutput],
+    expected_tool_calls_outputs: list[ToolOutput],
+    strict: bool = False,
+    justification_key: str = "explained_tool_calls_outputs",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool calls are correctly called, where expected args must be a subset of actual args.
+
+    Args:
+        actual_tool_calls_outputs: List of actual tool calls outputs.
+        expected_tool_calls_outputs: List of expected tool calls outputs.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+
+    Returns:
+        tuple[float, str]: Score based on the number of matches, and the justification.
+    """
+    if not expected_tool_calls_outputs and not actual_tool_calls_outputs:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls outputs are empty"
+            }
+        }
+    elif not expected_tool_calls_outputs or not actual_tool_calls_outputs:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls outputs are empty"
+            }
+        }
+
+    cnt = 0.0
+    justifications: dict[str, Any] = {justification_key: {}}
+    visited: set[int] = set()
+    tool_counters: dict[str, int] = {}
+
+    for expected_tool_call_output in expected_tool_calls_outputs:
+        matched = False
+
+        # Look through ALL actual tool calls to find a match
+        for idx, actual_tool_call_output in enumerate(actual_tool_calls_outputs):
+            if idx in visited:
+                continue
+            if actual_tool_call_output.name == expected_tool_call_output.name:
+                # Get or initialize counter for this tool name
+                tool_counters[actual_tool_call_output.name] = tool_counters.get(
+                    actual_tool_call_output.name, 0
+                )
+                tool_key = f"{actual_tool_call_output.name}_{tool_counters[actual_tool_call_output.name]}"
+                tool_counters[actual_tool_call_output.name] += 1
+
+                justifications[justification_key][tool_key] = (
+                    f"Actual: {actual_tool_call_output.output}, Expected: {expected_tool_call_output.output}, Score: {float(actual_tool_call_output.output == expected_tool_call_output.output)}"
+                )
+
+                if actual_tool_call_output.output == expected_tool_call_output.output:
+                    # Perfect match found
+                    cnt += 1.0
+                    visited.add(idx)
+                    matched = True
+                    break
+                elif strict:
+                    # In strict mode, any mismatch returns 0 immediately
+                    return 0.0, {
+                        justification_key: {
+                            tool_key: justifications[justification_key][tool_key]
+                        }
+                    }
+                # In non-strict mode with mismatch, continue looking for perfect match
+                # DON'T add to visited, DON'T break
+
+        # If no match found and we're in strict mode, return 0
+        if not matched and strict:
+            return 0.0, {
+                justification_key: {
+                    "_result": f"No matching actual tool call found for expected {expected_tool_call_output.name}"
+                }
+            }
+
+    return (
+        cnt / len(expected_tool_calls_outputs)
+        if not strict
+        else float(cnt == len(expected_tool_calls_outputs))
+    ), justifications
+
+
+def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str:
+    """Convert OTEL spans to a platform-style agent run history string.
+
+    Creates a similar structure to LangChain message processing but using OTEL spans.
+    Only processes tool spans (spans with 'tool.name' attribute).
+
+    Args:
+        agent_trace: List of ReadableSpan objects from the agent execution
+
+    Returns:
+        String representation of the agent run history in platform format
+    """
+    platform_history = []
+    seen_tool_calls = set()
+
+    for span in agent_trace:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            # Get span timing information
+            start_time = span.start_time
+            end_time = span.end_time
+
+            # Convert nanoseconds to datetime if needed
+            if isinstance(start_time, int):
+                start_timestamp = datetime.fromtimestamp(start_time / 1e9)
+            else:
+                start_timestamp = start_time  # type:ignore
+
+            if isinstance(end_time, int):
+                end_timestamp = datetime.fromtimestamp(end_time / 1e9)
+            else:
+                end_timestamp = end_time  # type:ignore
+
+            timestamp_str = (
+                start_timestamp.strftime("%Y-%m-%d %H:%M:%S") if start_timestamp else ""
+            )
+
+            # Get tool call information
+            tool_args: Any = span.attributes.get("input.value", {})
+            tool_result = str(span.attributes.get("output.value", {})).strip()
+
+            span_id = (
+                span.context.span_id
+                if span.context
+                else str(hash(f"{tool_name}_{timestamp_str}"))
+            )
+
+            # De-duplicate tool calls based on span ID
+            if span_id in seen_tool_calls:
+                continue
+            seen_tool_calls.add(span_id)
+
+            # Add tool selection (equivalent to AIMessage with tool_calls)
+            platform_history.append(f"[{timestamp_str}] LLM Response:")
+            platform_history.append(" Agent Selected 1 Tool(s):")
+            platform_history.append("")
+            platform_history.append(f" Tool: {tool_name}")
+            platform_history.append(f" Arguments: {str(tool_args)}")
+            platform_history.append("")
+
+            # Add tool response (equivalent to ToolMessage)
+            end_timestamp_str = (
+                end_timestamp.strftime("%Y-%m-%d %H:%M:%S")
+                if end_timestamp
+                else timestamp_str
+            )
+            platform_history.append(
+                f"[{end_timestamp_str}] Tool Call Response - {tool_name}:"
+            )
+            platform_history.append(f"{tool_result}")
+            platform_history.append("")
+
+    return "\n".join(platform_history)
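The helpers above are pure functions over tool-call data extracted from OTEL spans, so they can be exercised directly. A minimal sketch, assuming uipath 2.1.109 is installed and using the module path from the file list above; the tool names and counts are illustrative only:

# Sketch: exercising the pure scoring helpers added in this file.
from uipath.eval._helpers.evaluators_helpers import (
    tool_calls_count_score,
    tool_calls_order_score,
)

# LCS-based order score: extra actual calls are ignored, order of expected calls matters.
score, justification = tool_calls_order_score(
    actual_tool_calls_names=["search", "fetch", "summarize"],
    expected_tool_calls_names=["fetch", "summarize"],
)
print(score)                 # 1.0 -> LCS length 2 / 2 expected calls
print(justification["lcs"])  # ['fetch', 'summarize']

# Count score: expected counts are (comparator, count) tuples keyed by tool name.
score, justification = tool_calls_count_score(
    actual_tool_calls_count={"fetch": 2, "summarize": 1},
    expected_tool_calls_count={"fetch": (">=", 1), "summarize": ("==", 2)},
)
print(score)  # 0.5 -> "fetch" satisfies >= 1, "summarize" fails == 2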
uipath/eval/_helpers/helpers.py
@@ -1,10 +1,13 @@
+import functools
 import json
 import os
+import time
+from collections.abc import Callable
+from typing import Any
 
 import click
 
-from uipath._cli._utils._console import ConsoleLogger
-from uipath._utils.constants import UIPATH_CONFIG_FILE
+from ..models import ErrorEvaluationResult, EvaluationResult
 
 
 def auto_discover_entrypoint() -> str:
@@ -16,6 +19,9 @@ def auto_discover_entrypoint() -> str:
     Raises:
         ValueError: If no entrypoint found or multiple entrypoints exist
     """
+    from uipath._cli._utils._console import ConsoleLogger
+    from uipath._utils.constants import UIPATH_CONFIG_FILE
+
     console = ConsoleLogger()
 
     if not os.path.isfile(UIPATH_CONFIG_FILE):
@@ -45,3 +51,25 @@
         f"Auto-discovered agent entrypoint: {click.style(entrypoint, fg='cyan')}"
     )
     return entrypoint
+
+
+def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]:
+    """Decorator to track evaluation metrics and handle errors gracefully."""
+
+    @functools.wraps(func)
+    async def wrapper(*args: Any, **kwargs: Any) -> EvaluationResult:
+        start_time = time.time()
+        try:
+            result = await func(*args, **kwargs)
+        except Exception as e:
+            result = ErrorEvaluationResult(
+                details="Exception thrown by evaluator: {}".format(e),
+                evaluation_time=time.time() - start_time,
+            )
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        result.evaluation_time = execution_time
+        return result
+
+    return wrapper
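The new track_evaluation_metrics decorator wraps an async evaluator call, converts any raised exception into an ErrorEvaluationResult, and stamps evaluation_time on whatever result comes back. A minimal sketch of the error path, assuming uipath 2.1.109 is installed and the import paths shown in the hunks above; the failing coroutine is purely illustrative:

# Sketch: the decorator swallows the exception and returns an error result instead.
import asyncio

from uipath._cli._evals._helpers import track_evaluation_metrics  # if importable here; path per this diff: uipath/eval/_helpers/helpers.py
from uipath.eval._helpers.helpers import track_evaluation_metrics  # module added/extended in this release
from uipath.eval.models import ErrorEvaluationResult


@track_evaluation_metrics
async def flaky_evaluate():
    # Illustrative failure: with the decorator, this does not propagate.
    raise RuntimeError("model endpoint unavailable")


result = asyncio.run(flaky_evaluate())
assert isinstance(result, ErrorEvaluationResult)
print(result.evaluation_time)  # elapsed wall-clock seconds, set by the wrapper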
uipath/eval/evaluators/__init__.py
@@ -1,15 +1,70 @@
 """UiPath evaluator implementations for agent performance evaluation."""
 
-from .base_evaluator import BaseEvaluator
+from typing import Any
+
+# Current coded evaluators
+from .base_evaluator import BaseEvaluationCriteria, BaseEvaluator, BaseEvaluatorConfig
+from .contains_evaluator import ContainsEvaluator
 from .exact_match_evaluator import ExactMatchEvaluator
 from .json_similarity_evaluator import JsonSimilarityEvaluator
-from .llm_as_judge_evaluator import LlmAsAJudgeEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
+
+# Legacy evaluators
+from .legacy_base_evaluator import LegacyBaseEvaluator
+from .legacy_exact_match_evaluator import LegacyExactMatchEvaluator
+from .legacy_json_similarity_evaluator import LegacyJsonSimilarityEvaluator
+from .legacy_llm_as_judge_evaluator import LegacyLlmAsAJudgeEvaluator
+from .legacy_trajectory_evaluator import LegacyTrajectoryEvaluator
+from .llm_judge_output_evaluator import (
+    BaseLLMOutputEvaluator,
+    LLMJudgeOutputEvaluator,
+    LLMJudgeStrictJSONSimilarityOutputEvaluator,
+)
+from .llm_judge_trajectory_evaluator import (
+    BaseLLMTrajectoryEvaluator,
+    LLMJudgeTrajectoryEvaluator,
+    LLMJudgeTrajectorySimulationEvaluator,
+)
+from .tool_call_args_evaluator import ToolCallArgsEvaluator
+from .tool_call_count_evaluator import ToolCallCountEvaluator
+from .tool_call_order_evaluator import ToolCallOrderEvaluator
+from .tool_call_output_evaluator import ToolCallOutputEvaluator
+
+EVALUATORS: list[type[BaseEvaluator[Any, Any, Any]]] = [
+    ExactMatchEvaluator,
+    ContainsEvaluator,
+    JsonSimilarityEvaluator,
+    LLMJudgeOutputEvaluator,
+    LLMJudgeStrictJSONSimilarityOutputEvaluator,
+    LLMJudgeTrajectoryEvaluator,
+    LLMJudgeTrajectorySimulationEvaluator,
+    ToolCallOrderEvaluator,
+    ToolCallArgsEvaluator,
+    ToolCallCountEvaluator,
+    ToolCallOutputEvaluator,
+]
 
 __all__ = [
+    # Legacy evaluators
+    "LegacyBaseEvaluator",
+    "LegacyExactMatchEvaluator",
+    "LegacyJsonSimilarityEvaluator",
+    "LegacyLlmAsAJudgeEvaluator",
+    "LegacyTrajectoryEvaluator",
+    # Current coded evaluators
     "BaseEvaluator",
+    "ContainsEvaluator",
     "ExactMatchEvaluator",
     "JsonSimilarityEvaluator",
-    "LlmAsAJudgeEvaluator",
-    "TrajectoryEvaluator",
+    "BaseLLMOutputEvaluator",
+    "LLMJudgeOutputEvaluator",
+    "LLMJudgeStrictJSONSimilarityOutputEvaluator",
+    "BaseLLMTrajectoryEvaluator",
+    "LLMJudgeTrajectoryEvaluator",
+    "LLMJudgeTrajectorySimulationEvaluator",
+    "ToolCallOrderEvaluator",
+    "ToolCallArgsEvaluator",
+    "ToolCallCountEvaluator",
+    "ToolCallOutputEvaluator",
+    "BaseEvaluationCriteria",
+    "BaseEvaluatorConfig",
 ]
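The reorganized __init__.py keeps the old evaluators importable under Legacy* names and adds an EVALUATORS list of the current coded evaluator classes. A minimal sketch of the new import surface, assuming uipath 2.1.109 is installed; only the names exported above are used:

# Sketch: the reorganized public surface of uipath.eval.evaluators.
from uipath.eval.evaluators import (
    EVALUATORS,                  # registry of the current coded evaluator classes
    ContainsEvaluator,
    LegacyLlmAsAJudgeEvaluator,  # old-style LLM-as-judge evaluator, now under a Legacy* name
)

# LlmAsAJudgeEvaluator and TrajectoryEvaluator are no longer exported, so code
# written against 2.1.107 needs to move to the Legacy* or new LLMJudge* classes.
print([evaluator.__name__ for evaluator in EVALUATORS])
print(ContainsEvaluator in EVALUATORS)  # True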