uipath 2.1.108__py3-none-any.whl → 2.1.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of uipath might be problematic.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/RECORD +69 -37
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/cli_eval.py
CHANGED
@@ -130,7 +130,11 @@ def eval(

     eval_context.no_report = no_report
     eval_context.workers = workers
-
+
+    # Load eval set to resolve the path
+    eval_set_path = eval_set or EvalHelpers.auto_discover_eval_set()
+    _, resolved_eval_set_path = EvalHelpers.load_eval_set(eval_set_path, eval_ids)
+    eval_context.eval_set = resolved_eval_set_path
     eval_context.eval_ids = eval_ids

     console_reporter = ConsoleProgressReporter()
uipath/_cli/cli_pull.py
CHANGED
@@ -24,20 +24,6 @@ from ._utils._project_files import ProjectPullError, pull_project
 console = ConsoleLogger()


-class InteractiveConflictHandler:
-    """Handler that prompts user for each conflict."""
-
-    def __init__(self, console: ConsoleLogger):
-        self.console = console
-
-    def should_overwrite(
-        self, file_path: str, local_hash: str, remote_hash: str
-    ) -> bool:
-        self.console.warning(f" File {file_path} differs from remote version.")
-        response = click.confirm("Do you want to overwrite it?", default=False)
-        return response
-
-
 @click.command()
 @click.argument(
     "root",
@@ -66,22 +52,21 @@ def pull(root: Path) -> None:
     project_id = os.getenv(UIPATH_PROJECT_ID)
     if not project_id:
         console.error("UIPATH_PROJECT_ID environment variable not found.")
+        return

-
+    download_configuration = {
         "source_code": root,
         "evals": root / "evals",
     }

-
-
-
-
-
-        InteractiveConflictHandler(console),
-    ):
+    try:
+
+        async def run_pull():
+            async for update in pull_project(project_id, download_configuration):
+                console.info(f"Processing: {update.file_path}")
                 console.info(update.message)
-    except ProjectPullError as e:
-        console.error(e.message, include_traceback=True)

-
-
+        asyncio.run(run_pull())
+        console.success("Project pulled successfully")
+    except ProjectPullError as e:
+        console.error(f"Failed to pull UiPath project: {str(e)}")
uipath/_cli/cli_register.py
ADDED
@@ -0,0 +1,45 @@
+# type: ignore
+import logging
+
+import click
+
+from ..telemetry import track
+from ._evals._helpers import register_evaluator
+from ._utils._console import ConsoleLogger
+from ._utils._resources import Resources
+
+logger = logging.getLogger(__name__)
+console = ConsoleLogger()
+
+
+@click.command()
+@click.argument("resource", required=True)
+@click.argument("args", nargs=-1)
+@track
+def register(resource: str, args: tuple[str]) -> None:
+    """Register a local resource.
+
+    Examples:
+        uipath register evaluator my-custom-evaluator.py
+    """
+    match Resources.from_string(resource):
+        case Resources.EVALUATOR:
+            usage_hint = f"Usage: {click.style('uipath register evaluator <evaluator_file_name> (ex. my_custom_evaluator.py)', fg='cyan')}"
+            if len(args) < 1:
+                console.hint(usage_hint)
+                console.error("Missing required argument: evaluator_file_name.")
+                return
+            if len(args) > 1:
+                console.hint(usage_hint)
+                console.error(
+                    f"Too many arguments provided: {args}. Expected only evaluator_file_name (ex. my_custom_evaluator.py)"
+                )
+
+            filename = args[0]
+
+            if not isinstance(filename, str) or not filename.strip():
+                console.hint(usage_hint)
+                console.error("Invalid filename: must be a non-empty string")
+                return
+
+            register_evaluator(filename)
uipath/_events/_events.py
CHANGED
@@ -3,9 +3,9 @@ from enum import Enum
 from typing import Any, Dict, List, Optional, Union

 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field, SkipValidation, model_validator

-from uipath._cli._evals._models._evaluation_set import
+from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem, AnyEvaluator
 from uipath.eval.models import EvalItemResult


@@ -21,12 +21,13 @@ class EvalSetRunCreatedEvent(BaseModel):
     entrypoint: str
     eval_set_id: str
     no_of_evals: int
-
+    # skip validation to avoid abstract class instantiation
+    evaluators: SkipValidation[List[AnyEvaluator]]


 class EvalRunCreatedEvent(BaseModel):
     execution_id: str
-    eval_item:
+    eval_item: AnyEvaluationItem


 class EvalItemExceptionDetails(BaseModel):
@@ -40,7 +41,7 @@ class EvalRunUpdatedEvent(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)

     execution_id: str
-    eval_item:
+    eval_item: AnyEvaluationItem
     eval_results: List[EvalItemResult]
     success: bool
     agent_output: Any
uipath/_utils/constants.py
CHANGED
@@ -1,6 +1,7 @@
 # Environment variables
 DOTENV_FILE = ".env"
 ENV_BASE_URL = "UIPATH_URL"
+ENV_EVAL_BACKEND_URL = "UIPATH_EVAL_BACKEND_URL"
 ENV_UNATTENDED_USER_ACCESS_TOKEN = "UNATTENDED_USER_ACCESS_TOKEN"
 ENV_UIPATH_ACCESS_TOKEN = "UIPATH_ACCESS_TOKEN"
 ENV_FOLDER_KEY = "UIPATH_FOLDER_KEY"
@@ -46,3 +47,6 @@ COMMUNITY_agents_SUFFIX = "-community-agents"

 # File names
 UIPATH_CONFIG_FILE = "uipath.json"
+
+# Evaluators
+CUSTOM_EVALUATOR_PREFIX = "file://"
uipath/eval/_helpers/evaluators_helpers.py
ADDED
@@ -0,0 +1,494 @@
+import ast
+import json
+from collections.abc import Mapping, Sequence
+from datetime import datetime
+from typing import Any
+
+from opentelemetry.sdk.trace import ReadableSpan
+
+from ..models import (
+    ToolCall,
+    ToolOutput,
+)
+
+COMPARATOR_MAPPINGS = {
+    ">": "gt",
+    "<": "lt",
+    ">=": "ge",
+    "<=": "le",
+    "=": "eq",
+    "==": "eq",
+    "!=": "ne",
+}
+
+COMMUNITY_agents_SUFFIX = "-community-agents"
+
+
+def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]:
+    """Extract the tool call names from execution spans IN ORDER.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        List of tool names in the order they were called.
+    """
+    tool_calls_names = []
+
+    for span in spans:
+        # Check for tool.name attribute first
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            tool_calls_names.append(str(tool_name))
+
+    return tool_calls_names
+
+
+def extract_tool_calls(spans: Sequence[ReadableSpan]) -> list[ToolCall]:
+    """Extract the tool calls from execution spans with their arguments.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        Dict of tool calls with their arguments.
+    """
+    tool_calls = []
+
+    for span in spans:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            try:
+                input_value: Any = span.attributes.get("input.value", {})
+                # Ensure input_value is a string before parsing
+                if isinstance(input_value, str):
+                    arguments = ast.literal_eval(input_value)
+                elif isinstance(input_value, dict):
+                    arguments = input_value
+                else:
+                    arguments = {}
+                tool_calls.append(ToolCall(name=str(tool_name), args=arguments))
+            except (json.JSONDecodeError, SyntaxError, ValueError):
+                # Handle case where input.value is not valid JSON/Python syntax
+                tool_calls.append(ToolCall(name=str(tool_name), args={}))
+
+    return tool_calls
+
+
+def extract_tool_calls_outputs(spans: Sequence[ReadableSpan]) -> list[ToolOutput]:
+    """Extract the outputs of the tool calls from execution spans.
+
+    Args:
+        spans: List of ReadableSpan objects from agent execution.
+
+    Returns:
+        List of tool calls outputs.
+    """
+    # After span normalization, the output.value should always be a dict with a content field
+    # We keep this list of potential output keys for extensibility purposes (e.g. frameworks without span normalization)
+    potential_output_keys = ["content"]
+    tool_calls_outputs = []
+    for span in spans:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            output = span.attributes.get("output.value", "")
+            final_output = ""
+
+            # Handle different output formats
+            if isinstance(output, str):
+                try:
+                    # Try to parse as JSON and extract content field
+                    parsed_output = json.loads(output)
+                    if isinstance(parsed_output, dict):
+                        for key in potential_output_keys:
+                            if key in parsed_output:
+                                final_output = parsed_output[key]
+                                break
+                    else:
+                        # If parsed JSON is not a dict, use the original string
+                        final_output = output
+                except (json.JSONDecodeError, ValueError):
+                    # If parsing fails, use the string as-is
+                    final_output = output
+            elif isinstance(output, dict):
+                # If output is already a dict, extract content field
+                for key in potential_output_keys:
+                    if key in output:
+                        final_output = output.get(key, "")
+                        break
+            else:
+                final_output = str(output)
+
+            tool_calls_outputs.append(
+                ToolOutput(
+                    name=str(tool_name),
+                    output=str(final_output) if final_output else "",
+                )
+            )
+    return tool_calls_outputs
+
+
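
Aside (not part of the diff): a minimal sketch of how the extractors above consume OTEL spans, using OpenTelemetry's in-memory exporter. The import path mirrors the new module's location in the file listing; it is an underscore-prefixed (internal) module, so this is illustrative rather than documented API.

# Sketch: producing spans shaped like the ones the helpers look for ("tool.name"
# plus an "input.value" payload that ast.literal_eval can parse), then extracting them.
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

from uipath.eval._helpers.evaluators_helpers import (
    extract_tool_calls,
    extract_tool_calls_names,
)

exporter = InMemorySpanExporter()
provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(exporter))
tracer = provider.get_tracer("demo")

with tracer.start_as_current_span("tool-call") as span:
    span.set_attribute("tool.name", "search_weather")
    span.set_attribute("input.value", "{'city': 'Berlin'}")

spans = exporter.get_finished_spans()
print(extract_tool_calls_names(spans))  # ['search_weather']
print(extract_tool_calls(spans))        # one ToolCall with args={'city': 'Berlin'}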
+def tool_calls_order_score(
+    actual_tool_calls_names: Sequence[str],
+    expected_tool_calls_names: Sequence[str],
+    strict: bool = False,
+) -> tuple[float, dict[str, Any]]:
+    """The function calculates a score based on LCS applied to the order of the tool calls.
+
+    It calculates the longest common subsequence between the actual tool calls
+    and the expected tool calls and returns the ratio of the LCS length to the number of
+    expected calls.
+
+    Args:
+        actual_tool_calls_names: List of tool names in the actual order
+        expected_tool_calls_names: List of tool names in the expected order
+        strict: If True, the function will return 0 if the actual calls do not match the expected calls exactly
+
+    Returns:
+        tuple[float, dict]: Ratio of the LCS length to the number of expected, and the justification dict
+    """
+    justification = {
+        "actual_tool_calls_order": list(actual_tool_calls_names),
+        "expected_tool_calls_order": list(expected_tool_calls_names),
+        "lcs": [],
+    }
+
+    # Handle empty cases
+    if not expected_tool_calls_names and not actual_tool_calls_names:
+        return 1.0, justification
+    elif not expected_tool_calls_names or not actual_tool_calls_names:
+        return 0.0, justification
+
+    # Handle exact match
+    if expected_tool_calls_names == actual_tool_calls_names:
+        justification["lcs"] = list(actual_tool_calls_names)
+        return 1.0, justification
+
+    # Handle strict mode - only perfect matches allowed
+    if strict:
+        return 0.0, justification
+
+    # Calculate LCS with full DP table for efficient reconstruction
+    m, n = len(actual_tool_calls_names), len(expected_tool_calls_names)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+    # Build DP table - O(m*n)
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if actual_tool_calls_names[i - 1] == expected_tool_calls_names[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+            else:
+                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+    # Reconstruct LCS - O(m+n)
+    lcs = []
+    i, j = m, n
+    while i > 0 and j > 0:
+        if actual_tool_calls_names[i - 1] == expected_tool_calls_names[j - 1]:
+            lcs.append(actual_tool_calls_names[i - 1])
+            i -= 1
+            j -= 1
+        elif dp[i - 1][j] > dp[i][j - 1]:
+            i -= 1
+        else:
+            j -= 1
+
+    lcs.reverse() # Reverse to get correct order
+    lcs_length = len(lcs)
+    justification["lcs"] = lcs
+    return lcs_length / n, justification
+
+
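
Aside (not part of the diff): a worked example of the LCS-based order score. The longest common subsequence of the two lists below is ['search', 'send_email'], so the score is 2/3. Import path taken from the file listing; illustrative only.

# Worked example (illustrative): score = len(LCS) / len(expected) = 2 / 3.
from uipath.eval._helpers.evaluators_helpers import tool_calls_order_score

score, justification = tool_calls_order_score(
    actual_tool_calls_names=["search", "summarize", "send_email"],
    expected_tool_calls_names=["search", "send_email", "archive"],
)
print(round(score, 3))        # 0.667
print(justification["lcs"])   # ['search', 'send_email']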
+def tool_calls_count_score(
+    actual_tool_calls_count: Mapping[str, int],
+    expected_tool_calls_count: Mapping[str, tuple[str, int]],
+    strict: bool = False,
+    justification_key: str = "explained_tool_calls_count",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool call counts match the actual tool call counts.
+
+    Args:
+        actual_tool_calls_count: Mapping of tool names to their actual call counts.
+        expected_tool_calls_count: Mapping of tool names to expected (comparator, count) tuples.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+        justification_key: Key to use for the justification in the returned dict.
+
+    Returns:
+        tuple[float, dict]: Score based on the number of matches, and the justification dict.
+    """
+    if not expected_tool_calls_count and not actual_tool_calls_count:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls are empty"
+            }
+        }
+    elif not expected_tool_calls_count or not actual_tool_calls_count:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls are empty"
+            }
+        }
+
+    score = 0.0
+    justifications: dict[str, Any] = {justification_key: {}}
+    for tool_name, (
+        expected_comparator,
+        expected_count,
+    ) in expected_tool_calls_count.items():
+        actual_count = actual_tool_calls_count.get(tool_name, 0.0)
+        comparator = f"__{COMPARATOR_MAPPINGS[expected_comparator]}__"
+        to_add = float(getattr(actual_count, comparator)(expected_count))
+
+        justifications[justification_key][tool_name] = (
+            f"Actual: {actual_count}, Expected: {expected_count}, Score: {to_add}"
+        )
+        if strict and to_add == 0.0:
+            # When strict is True, if the actual count does not match the expected count, return 0
+            # The justification should only include the breaching tool name
+            return 0.0, {
+                justification_key: {
+                    tool_name: justifications[justification_key][tool_name]
+                }
+            }
+        score += to_add
+    return score / len(expected_tool_calls_count), justifications
+
+
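
Aside (not part of the diff): the expected counts are (comparator, count) tuples keyed by tool name, with comparators resolved through COMPARATOR_MAPPINGS. A small illustrative call, with the import path taken from the file listing:

# Illustrative call: "search" satisfies ">= 1", "send_email" fails "== 2",
# so the score is 1 / 2 = 0.5.
from uipath.eval._helpers.evaluators_helpers import tool_calls_count_score

score, justification = tool_calls_count_score(
    actual_tool_calls_count={"search": 2, "send_email": 1},
    expected_tool_calls_count={"search": (">=", 1), "send_email": ("==", 2)},
)
print(score)  # 0.5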
+def tool_calls_args_score(
+    actual_tool_calls: list[ToolCall],
+    expected_tool_calls: list[ToolCall],
+    strict: bool = False,
+    subset: bool = False,
+    justification_key: str = "explained_tool_calls_args",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool calls are correctly called with matching arguments.
+
+    This function does not check the order of the tool calls!
+
+    Args:
+        actual_tool_calls: List of actual tool calls with their arguments.
+        expected_tool_calls: List of expected tool calls with their arguments.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+        subset: If True, the function will check if the expected args are a subset of the actual args.
+        justification_key: Key to use for the justification in the returned dict.
+
+    Returns:
+        tuple[float, dict]: Score based on the number of matches, and the justification dict.
+    """
+    if not expected_tool_calls and not actual_tool_calls:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls are empty"
+            }
+        }
+    elif not expected_tool_calls or not actual_tool_calls:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls are empty"
+            }
+        }
+
+    cnt = 0
+    visited: set[int] = set()
+    justifications: dict[str, Any] = {justification_key: {}}
+    tool_counters: dict[str, int] = {}
+
+    for expected_tool_call in expected_tool_calls:
+        for idx, call in enumerate(actual_tool_calls):
+            if call.name == expected_tool_call.name and idx not in visited:
+                # Get or initialize counter for this tool name
+                tool_counters[call.name] = tool_counters.get(call.name, 0)
+                tool_key = f"{call.name}_{tool_counters[call.name]}"
+                tool_counters[call.name] += 1
+
+                # Check arguments based on mode
+                # The linter highlights a few problems here due to using lambdas, but they're safe to ignore
+                # Breaking this down into proper functions would unnecessarily make the code more complex
+                if subset:
+                    # Subset mode: safely check if all expected args exist and match
+                    args_check = ( # noqa: E731
+                        lambda k, v: k in call.args # noqa: B023
+                        and call.args[k] == v # noqa: B023
+                    )
+                else:
+                    # Exact mode: direct access (may raise KeyError)
+                    args_check = lambda k, v: call.args[k] == v # noqa: E731, B023
+
+                try:
+                    args_match = all(
+                        args_check(k, v) for k, v in expected_tool_call.args.items()
+                    )
+                except KeyError:
+                    # Only possible in exact mode when key is missing
+                    args_match = False
+
+                justifications[justification_key][tool_key] = (
+                    f"Actual: {call.args}, Expected: {expected_tool_call.args}, Score: {float(args_match)}"
+                )
+                if args_match:
+                    cnt += 1
+                    visited.add(idx)
+                    break
+                # In case of mismatch, DON'T add to visited in non-strict mode
+                # so this actual tool call can be matched against other expected calls
+
+    return (
+        cnt / len(expected_tool_calls)
+        if not strict
+        else float(cnt == len(expected_tool_calls))
+    ), justifications
+
+
+def tool_calls_output_score(
+    actual_tool_calls_outputs: list[ToolOutput],
+    expected_tool_calls_outputs: list[ToolOutput],
+    strict: bool = False,
+    justification_key: str = "explained_tool_calls_outputs",
+) -> tuple[float, dict[str, Any]]:
+    """Check if the expected tool calls are correctly called, where expected args must be a subset of actual args.
+
+    Args:
+        actual_tool_calls_outputs: List of actual tool calls outputs.
+        expected_tool_calls_outputs: List of expected tool calls outputs.
+        strict: If True, the function will return 0 if not all expected tool calls are matched.
+
+    Returns:
+        tuple[float, str]: Score based on the number of matches, and the justification.
+    """
+    if not expected_tool_calls_outputs and not actual_tool_calls_outputs:
+        return 1.0, {
+            justification_key: {
+                "_result": "Both expected and actual tool calls outputs are empty"
+            }
+        }
+    elif not expected_tool_calls_outputs or not actual_tool_calls_outputs:
+        return 0.0, {
+            justification_key: {
+                "_result": "Either expected or actual tool calls outputs are empty"
+            }
+        }
+
+    cnt = 0.0
+    justifications: dict[str, Any] = {justification_key: {}}
+    visited: set[int] = set()
+    tool_counters: dict[str, int] = {}
+
+    for expected_tool_call_output in expected_tool_calls_outputs:
+        matched = False
+
+        # Look through ALL actual tool calls to find a match
+        for idx, actual_tool_call_output in enumerate(actual_tool_calls_outputs):
+            if idx in visited:
+                continue
+            if actual_tool_call_output.name == expected_tool_call_output.name:
+                # Get or initialize counter for this tool name
+                tool_counters[actual_tool_call_output.name] = tool_counters.get(
+                    actual_tool_call_output.name, 0
+                )
+                tool_key = f"{actual_tool_call_output.name}_{tool_counters[actual_tool_call_output.name]}"
+                tool_counters[actual_tool_call_output.name] += 1
+
+                justifications[justification_key][tool_key] = (
+                    f"Actual: {actual_tool_call_output.output}, Expected: {expected_tool_call_output.output}, Score: {float(actual_tool_call_output.output == expected_tool_call_output.output)}"
+                )
+
+                if actual_tool_call_output.output == expected_tool_call_output.output:
+                    # Perfect match found
+                    cnt += 1.0
+                    visited.add(idx)
+                    matched = True
+                    break
+                elif strict:
+                    # In strict mode, any mismatch returns 0 immediately
+                    return 0.0, {
+                        justification_key: {
+                            tool_key: justifications[justification_key][tool_key]
+                        }
+                    }
+                # In non-strict mode with mismatch, continue looking for perfect match
+                # DON'T add to visited, DON'T break
+
+        # If no match found and we're in strict mode, return 0
+        if not matched and strict:
+            return 0.0, {
+                justification_key: {
+                    "_result": f"No matching actual tool call found for expected {expected_tool_call_output.name}"
+                }
+            }
+
+    return (
+        cnt / len(expected_tool_calls_outputs)
+        if not strict
+        else float(cnt == len(expected_tool_calls_outputs))
+    ), justifications
+
+
+def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str:
+    """Convert OTEL spans to a platform-style agent run history string.
+
+    Creates a similar structure to LangChain message processing but using OTEL spans.
+    Only processes tool spans (spans with 'tool.name' attribute).
+
+    Args:
+        agent_trace: List of ReadableSpan objects from the agent execution
+
+    Returns:
+        String representation of the agent run history in platform format
+    """
+    platform_history = []
+    seen_tool_calls = set()
+
+    for span in agent_trace:
+        if span.attributes and (tool_name := span.attributes.get("tool.name")):
+            # Get span timing information
+            start_time = span.start_time
+            end_time = span.end_time
+
+            # Convert nanoseconds to datetime if needed
+            if isinstance(start_time, int):
+                start_timestamp = datetime.fromtimestamp(start_time / 1e9)
+            else:
+                start_timestamp = start_time # type:ignore
+
+            if isinstance(end_time, int):
+                end_timestamp = datetime.fromtimestamp(end_time / 1e9)
+            else:
+                end_timestamp = end_time # type:ignore
+
+            timestamp_str = (
+                start_timestamp.strftime("%Y-%m-%d %H:%M:%S") if start_timestamp else ""
+            )
+
+            # Get tool call information
+            tool_args: Any = span.attributes.get("input.value", {})
+            tool_result = str(span.attributes.get("output.value", {})).strip()
+
+            span_id = (
+                span.context.span_id
+                if span.context
+                else str(hash(f"{tool_name}_{timestamp_str}"))
+            )
+
+            # De-duplicate tool calls based on span ID
+            if span_id in seen_tool_calls:
+                continue
+            seen_tool_calls.add(span_id)
+
+            # Add tool selection (equivalent to AIMessage with tool_calls)
+            platform_history.append(f"[{timestamp_str}] LLM Response:")
+            platform_history.append(" Agent Selected 1 Tool(s):")
+            platform_history.append("")
+            platform_history.append(f" Tool: {tool_name}")
+            platform_history.append(f" Arguments: {str(tool_args)}")
+            platform_history.append("")
+
+            # Add tool response (equivalent to ToolMessage)
+            end_timestamp_str = (
+                end_timestamp.strftime("%Y-%m-%d %H:%M:%S")
+                if end_timestamp
+                else timestamp_str
+            )
+            platform_history.append(
+                f"[{end_timestamp_str}] Tool Call Response - {tool_name}:"
+            )
+            platform_history.append(f"{tool_result}")
+            platform_history.append("")
+
+    return "\n".join(platform_history)
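
Aside (not part of the diff): a small illustrative call to tool_calls_args_score. ToolCall is assumed to be importable from uipath.eval.models, mirroring the new module's own "from ..models import ToolCall" line; the import paths follow the file listing above.

# Illustrative call: with subset=True, every expected argument must be present
# in the matching actual call with the same value; extra actual args are allowed.
from uipath.eval._helpers.evaluators_helpers import tool_calls_args_score
from uipath.eval.models import ToolCall  # assumed re-export

actual = [ToolCall(name="search", args={"query": "weather", "lang": "en"})]
expected = [ToolCall(name="search", args={"query": "weather"})]

score, justification = tool_calls_args_score(actual, expected, subset=True)
print(score)  # 1.0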