uipath: 2.1.107-py3-none-any.whl → 2.1.109-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of uipath might be problematic.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_runtime/_contracts.py +25 -5
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_resources/SDK_REFERENCE.md +0 -97
- uipath/_uipath.py +10 -37
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0

uipath/eval/evaluators/output_evaluator.py
@@ -0,0 +1,117 @@
+"""Base class for all output evaluator configurations."""
+
+import json
+from typing import Any, TypeVar, Union
+
+from pydantic import Field
+
+from ..models import AgentExecution
+from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory
+from .base_evaluator import (
+    BaseEvaluationCriteria,
+    BaseEvaluator,
+    BaseEvaluatorConfig,
+    BaseEvaluatorJustification,
+)
+
+
+class OutputEvaluationCriteria(BaseEvaluationCriteria):
+    """Base class for all output evaluation criteria."""
+
+    expected_output: dict[str, Any] | str
+
+
+T = TypeVar("T", bound=BaseEvaluationCriteria)
+T_OutputCriteria = TypeVar("T_OutputCriteria", bound=OutputEvaluationCriteria)
+
+
+class OutputEvaluatorConfig(BaseEvaluatorConfig[T]):
+    """Base class for all output evaluator configurations.
+
+    Generic over T to allow subclasses to define their own
+    specific output evaluation criteria types while maintaining type safety.
+    """
+
+    target_output_key: str = Field(
+        default="*", description="Key to extract output from agent execution"
+    )
+
+
+C = TypeVar("C", bound=OutputEvaluatorConfig[Any])
+J = TypeVar("J", bound=Union[str, None, BaseEvaluatorJustification])
+
+
+class BaseOutputEvaluator(BaseEvaluator[T, C, J]):
+    """Abstract base class for all output evaluators.
+
+    Generic Parameters:
+        T_OutputCriteria: The output evaluation criteria type
+        C: The output evaluator config type (bound to OutputEvaluatorConfig[T_OutputCriteria])
+        J: The justification type
+    """
+
+    def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
+        """Get the actual output from the agent execution."""
+        if self.evaluator_config.target_output_key != "*":
+            try:
+                return agent_execution.agent_output[
+                    self.evaluator_config.target_output_key
+                ]
+            except KeyError as e:
+                raise UiPathEvaluationError(
+                    code="TARGET_OUTPUT_KEY_NOT_FOUND",
+                    title="Target output key not found in actual output",
+                    detail=f"Error: {e}",
+                    category=UiPathEvaluationErrorCategory.USER,
+                ) from e
+        return agent_execution.agent_output
+
+    def _get_full_expected_output(self, evaluation_criteria: T) -> Any:
+        """Get the full expected output from the evaluation criteria."""
+        raise UiPathEvaluationError(
+            code="NOT_IMPLEMENTED",
+            title="This method was not implemented by the subclass.",
+            detail="This method was not implemented by the subclass.",
+            category=UiPathEvaluationErrorCategory.SYSTEM,
+        )
+
+    def _get_expected_output(self, evaluation_criteria: T) -> Any:
+        """Load the expected output from the evaluation criteria."""
+        expected_output = self._get_full_expected_output(evaluation_criteria)
+        if self.evaluator_config.target_output_key != "*":
+            if isinstance(expected_output, str):
+                try:
+                    expected_output = json.loads(expected_output)
+                except json.JSONDecodeError as e:
+                    raise UiPathEvaluationError(
+                        code="INVALID_EXPECTED_OUTPUT",
+                        title="When target output key is not '*', expected output must be a dictionary or a valid JSON string",
+                        detail=f"Error: {e}",
+                        category=UiPathEvaluationErrorCategory.USER,
+                    ) from e
+            try:
+                expected_output = expected_output[
+                    self.evaluator_config.target_output_key
+                ]
+            except KeyError as e:
+                raise UiPathEvaluationError(
+                    code="TARGET_OUTPUT_KEY_NOT_FOUND",
+                    title="Target output key not found in expected output",
+                    detail=f"Error: {e}",
+                    category=UiPathEvaluationErrorCategory.USER,
+                ) from e
+        return expected_output
+
+
+class OutputEvaluator(BaseOutputEvaluator[T_OutputCriteria, C, J]):
+    """Abstract base class for all output evaluators.
+
+    Generic Parameters:
+        T_OutputCriteria: The output evaluation criteria type
+        C: The output evaluator config type (bound to OutputEvaluatorConfig[T_OutputCriteria])
+        J: The justification type
+    """
+
+    def _get_full_expected_output(self, evaluation_criteria: T_OutputCriteria) -> Any:
+        """Get the full expected output from the evaluation criteria."""
+        return evaluation_criteria.expected_output
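
The `target_output_key` logic above is the core of the new output-evaluator family: `"*"` compares the whole agent output, while any other key selects a single field (JSON-decoding string payloads first, on the expected side). A minimal standalone sketch of that selection rule, independent of the SDK classes; `select_output` is an illustrative name, not part of the package:

import json
from typing import Any

def select_output(output: dict[str, Any] | str, key: str) -> Any:
    """Mirror the selection rule: '*' keeps the whole output, any other
    key indexes into the (JSON-decoded, if needed) output."""
    if key == "*":
        return output
    if isinstance(output, str):
        output = json.loads(output)  # invalid JSON surfaces as JSONDecodeError
    return output[key]  # a missing key surfaces as KeyError

# '*' returns the full payload; a named key drills into one field.
assert select_output({"answer": 42, "steps": 3}, "*") == {"answer": 42, "steps": 3}
assert select_output('{"answer": 42}', "answer") == 42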

uipath/eval/evaluators/tool_call_args_evaluator.py
@@ -0,0 +1,82 @@
+"""Tool call order evaluator for validating correct sequence of tool calls."""
+
+from .._helpers.evaluators_helpers import (
+    extract_tool_calls,
+    tool_calls_args_score,
+)
+from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult, ToolCall
+from ..models.models import EvaluatorType
+from .base_evaluator import (
+    BaseEvaluationCriteria,
+    BaseEvaluator,
+    BaseEvaluatorConfig,
+    BaseEvaluatorJustification,
+)
+
+
+class ToolCallArgsEvaluationCriteria(BaseEvaluationCriteria):
+    """Evaluation criteria for the tool call order evaluator."""
+
+    # TODO: name field of ToolCall needs to be validated such that it contains only the tools available
+    tool_calls: list[ToolCall]
+
+
+class ToolCallArgsEvaluatorConfig(BaseEvaluatorConfig[ToolCallArgsEvaluationCriteria]):
+    """Configuration for the tool call count evaluator."""
+
+    name: str = "ToolCallArgsEvaluator"
+    strict: bool = False
+    subset: bool = False
+
+
+class ToolCallArgsEvaluatorJustification(BaseEvaluatorJustification):
+    """Justification for the tool call args evaluator."""
+
+    explained_tool_calls_args: dict[str, str]
+
+
+class ToolCallArgsEvaluator(
+    BaseEvaluator[
+        ToolCallArgsEvaluationCriteria,
+        ToolCallArgsEvaluatorConfig,
+        ToolCallArgsEvaluatorJustification,
+    ]
+):
+    """Evaluator that checks if the tool calls are in the correct order.
+
+    This evaluator returns True if the tool calls are in the correct order, and False otherwise.
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.TOOL_CALL_ARGS.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: ToolCallArgsEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate if the tool calls are in the correct order.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - agent_output: The final output of the agent
+                - agent_trace: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+        Returns:
+            EvaluationResult: Boolean result indicating correct tool call order (True/False)
+        """
+        tool_calls_order = extract_tool_calls(agent_execution.agent_trace)
+        score, justification = tool_calls_args_score(
+            tool_calls_order,
+            evaluation_criteria.tool_calls,
+            self.evaluator_config.strict,
+            self.evaluator_config.subset,
+        )
+        validated_justification = self.validate_justification(justification)
+        return NumericEvaluationResult(
+            score=score,
+            details=validated_justification,
+        )
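
The scoring itself lives in `tool_calls_args_score`, which this diff does not show; the `strict`/`subset` flags presumably control how an expected call's arguments are matched against an actual call. A hedged guess at what subset matching could mean, as a self-contained sketch (`args_match` is illustrative, not the SDK helper):

from typing import Any

def args_match(actual: dict[str, Any], expected: dict[str, Any], subset: bool) -> bool:
    # Assumed semantics: with subset=True the expected args only need to be
    # present (and equal) in the actual call; otherwise the dicts must match exactly.
    if subset:
        return all(actual.get(k) == v for k, v in expected.items())
    return actual == expected

print(args_match({"query": "cats", "limit": 5}, {"query": "cats"}, subset=True))   # True
print(args_match({"query": "cats", "limit": 5}, {"query": "cats"}, subset=False))  # False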

uipath/eval/evaluators/tool_call_count_evaluator.py
@@ -0,0 +1,87 @@
+"""Tool call count evaluator for validating expected tool usage patterns."""
+
+from collections import Counter
+
+from .._helpers.evaluators_helpers import (
+    extract_tool_calls_names,
+    tool_calls_count_score,
+)
+from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
+from ..models.models import EvaluatorType
+from .base_evaluator import (
+    BaseEvaluationCriteria,
+    BaseEvaluator,
+    BaseEvaluatorConfig,
+    BaseEvaluatorJustification,
+)
+
+
+class ToolCallCountEvaluationCriteria(BaseEvaluationCriteria):
+    """Evaluation criteria for the tool call count evaluator."""
+
+    # TODO: str field needs to be validated against some criteria that allows ">x", "<x", ">=x", "<=x", "x"
+    tool_calls_count: dict[str, tuple[str, int]]
+
+
+class ToolCallCountEvaluatorConfig(
+    BaseEvaluatorConfig[ToolCallCountEvaluationCriteria]
+):
+    """Configuration for the tool call count evaluator."""
+
+    name: str = "ToolCallCountEvaluator"
+    strict: bool = False
+
+
+class ToolCallCountEvaluatorJustification(BaseEvaluatorJustification):
+    """Justification for the tool call count evaluator."""
+
+    explained_tool_calls_count: dict[str, str]
+
+
+class ToolCallCountEvaluator(
+    BaseEvaluator[
+        ToolCallCountEvaluationCriteria,
+        ToolCallCountEvaluatorConfig,
+        ToolCallCountEvaluatorJustification,
+    ]
+):
+    """Evaluator that checks if the tool calls match the expected count.
+
+    This evaluator returns a score based on how well the actual tool call counts
+    match the expected counts specified in the criteria.
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.TOOL_CALL_COUNT.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: ToolCallCountEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate if the tool calls are in the correct order.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - agent_output: The final output of the agent
+                - agent_trace: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+        Returns:
+            EvaluationResult: Boolean result indicating correct tool call order (True/False)
+        """
+        tool_calls_count = Counter(
+            extract_tool_calls_names(agent_execution.agent_trace)
+        )
+        score, justification = tool_calls_count_score(
+            tool_calls_count,
+            evaluation_criteria.tool_calls_count,
+            self.evaluator_config.strict,
+        )
+        validated_justification = self.validate_justification(justification)
+        return NumericEvaluationResult(
+            score=score,
+            details=validated_justification,
+        )
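
The criteria type `dict[str, tuple[str, int]]`, together with the TODO about ">x", "<x", ">=x", "<=x", "x", suggests each tool name maps to a comparator plus an expected count. A sketch of how such constraints could be checked against a `Counter` of observed calls (the comparator table and `counts_satisfied` are assumptions, not the SDK's `tool_calls_count_score`):

import operator
from collections import Counter

# Assumed comparator grammar hinted at by the TODO in the criteria model.
OPS = {">": operator.gt, "<": operator.lt, ">=": operator.ge,
       "<=": operator.le, "=": operator.eq}

def counts_satisfied(actual: Counter, expected: dict[str, tuple[str, int]]) -> dict[str, bool]:
    """Check each per-tool constraint against the observed call counts."""
    return {tool: OPS[op](actual[tool], n) for tool, (op, n) in expected.items()}

actual = Counter(["search", "search", "send_email"])
print(counts_satisfied(actual, {"search": (">=", 2), "send_email": ("=", 1)}))
# {'search': True, 'send_email': True}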

uipath/eval/evaluators/tool_call_order_evaluator.py
@@ -0,0 +1,84 @@
+"""Tool call order evaluator for validating correct sequence of tool calls."""
+
+from .._helpers.evaluators_helpers import (
+    extract_tool_calls_names,
+    tool_calls_order_score,
+)
+from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
+from ..models.models import EvaluatorType
+from .base_evaluator import (
+    BaseEvaluationCriteria,
+    BaseEvaluator,
+    BaseEvaluatorConfig,
+    BaseEvaluatorJustification,
+)
+
+
+class ToolCallOrderEvaluationCriteria(BaseEvaluationCriteria):
+    """Evaluation criteria for the tool call order evaluator."""
+
+    # TODO: str field needs to be validated such that it contains only the tools available
+    tool_calls_order: list[str]
+
+
+class ToolCallOrderEvaluatorConfig(
+    BaseEvaluatorConfig[ToolCallOrderEvaluationCriteria]
+):
+    """Configuration for the tool call count evaluator."""
+
+    name: str = "ToolCallOrderEvaluator"
+    strict: bool = False
+
+
+class ToolCallOrderEvaluatorJustification(BaseEvaluatorJustification):
+    """Justification for the tool call order evaluator."""
+
+    actual_tool_calls_order: list[str]
+    expected_tool_calls_order: list[str]
+    lcs: list[str]
+
+
+class ToolCallOrderEvaluator(
+    BaseEvaluator[
+        ToolCallOrderEvaluationCriteria,
+        ToolCallOrderEvaluatorConfig,
+        ToolCallOrderEvaluatorJustification,
+    ]
+):
+    """Evaluator that checks if the tool calls are in the correct order.
+
+    This evaluator returns True if the tool calls are in the correct order, and False otherwise.
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.TOOL_CALL_ORDER.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: ToolCallOrderEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate if the tool calls are in the correct order.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - agent_output: The final output of the agent
+                - agent_trace: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+        Returns:
+            EvaluationResult: Boolean result indicating correct tool call order (True/False)
+        """
+        tool_calls_order = extract_tool_calls_names(agent_execution.agent_trace)
+        score, justification = tool_calls_order_score(
+            tool_calls_order,
+            evaluation_criteria.tool_calls_order,
+            self.evaluator_config.strict,
+        )
+        validated_justification = self.validate_justification(justification)
+        return NumericEvaluationResult(
+            score=score,
+            details=validated_justification,
+        )
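
The `lcs` field in the justification model suggests the order score derives from a longest common subsequence of the actual and expected tool-name sequences. A self-contained sketch of that computation (the normalization by expected length is a guess, not taken from the diff):

def longest_common_subsequence(a: list[str], b: list[str]) -> list[str]:
    """Classic dynamic-programming LCS over two name sequences."""
    dp = [[[] for _ in range(len(b) + 1)] for _ in range(len(a) + 1)]
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            if x == y:
                dp[i + 1][j + 1] = dp[i][j] + [x]
            else:
                dp[i + 1][j + 1] = max(dp[i][j + 1], dp[i + 1][j], key=len)
    return dp[-1][-1]

actual = ["search", "summarize", "send_email"]
expected = ["search", "send_email"]
common = longest_common_subsequence(actual, expected)
print(common, len(common) / len(expected))  # ['search', 'send_email'] 1.0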

uipath/eval/evaluators/tool_call_output_evaluator.py
@@ -0,0 +1,87 @@
+"""Tool call order evaluator for validating correct sequence of tool calls."""
+
+from .._helpers.evaluators_helpers import (
+    extract_tool_calls_outputs,
+    tool_calls_output_score,
+)
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    NumericEvaluationResult,
+    ToolOutput,
+)
+from ..models.models import EvaluatorType
+from .base_evaluator import (
+    BaseEvaluationCriteria,
+    BaseEvaluator,
+    BaseEvaluatorConfig,
+    BaseEvaluatorJustification,
+)
+
+
+class ToolCallOutputEvaluationCriteria(BaseEvaluationCriteria):
+    """Evaluation criteria for the tool call order evaluator."""
+
+    # TODO: name field of ToolCall needs to be validated such that it contains only the tools available
+    tool_outputs: list[ToolOutput]
+
+
+class ToolCallOutputEvaluatorConfig(
+    BaseEvaluatorConfig[ToolCallOutputEvaluationCriteria]
+):
+    """Configuration for the tool call count evaluator."""
+
+    name: str = "ToolCallOutputEvaluator"
+    strict: bool = False
+
+
+class ToolCallOutputEvaluatorJustification(BaseEvaluatorJustification):
+    """Justification for the tool call output evaluator."""
+
+    explained_tool_calls_outputs: dict[str, str]
+
+
+class ToolCallOutputEvaluator(
+    BaseEvaluator[
+        ToolCallOutputEvaluationCriteria,
+        ToolCallOutputEvaluatorConfig,
+        ToolCallOutputEvaluatorJustification,
+    ]
+):
+    """Evaluator that checks if the tool calls are in the correct order.
+
+    This evaluator returns True if the tool calls are in the correct order, and False otherwise.
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.TOOL_CALL_OUTPUT.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: ToolCallOutputEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate if the tool calls are in the correct order.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - agent_output: The final output of the agent
+                - agent_trace: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+        Returns:
+            EvaluationResult: Boolean result indicating correct tool call order (True/False)
+        """
+        tool_calls_outputs = extract_tool_calls_outputs(agent_execution.agent_trace)
+        score, justification = tool_calls_output_score(
+            tool_calls_outputs,
+            evaluation_criteria.tool_outputs,
+            self.evaluator_config.strict,
+        )
+        validated_justification = self.validate_justification(justification)
+        return NumericEvaluationResult(
+            score=score,
+            details=validated_justification,
+        )

uipath/eval/evaluators_types/ContainsEvaluator.json
@@ -0,0 +1,73 @@
+{
+  "evaluatorTypeId": "uipath-contains",
+  "evaluatorConfigSchema": {
+    "$defs": {
+      "ContainsEvaluationCriteria": {
+        "description": "Evaluation criteria for the contains evaluator.",
+        "properties": {
+          "search_text": {
+            "title": "Search Text",
+            "type": "string"
+          }
+        },
+        "required": [
+          "search_text"
+        ],
+        "title": "ContainsEvaluationCriteria",
+        "type": "object"
+      }
+    },
+    "description": "Configuration for the contains evaluator.",
+    "properties": {
+      "name": {
+        "default": "ContainsEvaluator",
+        "title": "Name",
+        "type": "string"
+      },
+      "default_evaluation_criteria": {
+        "anyOf": [
+          {
+            "$ref": "#/$defs/ContainsEvaluationCriteria"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null
+      },
+      "target_output_key": {
+        "default": "*",
+        "description": "Key to extract output from agent execution",
+        "title": "Target Output Key",
+        "type": "string"
+      },
+      "case_sensitive": {
+        "default": false,
+        "title": "Case Sensitive",
+        "type": "boolean"
+      },
+      "negated": {
+        "default": false,
+        "title": "Negated",
+        "type": "boolean"
+      }
+    },
+    "title": "ContainsEvaluatorConfig",
+    "type": "object"
+  },
+  "evaluationCriteriaSchema": {
+    "description": "Evaluation criteria for the contains evaluator.",
+    "properties": {
+      "search_text": {
+        "title": "Search Text",
+        "type": "string"
+      }
+    },
+    "required": [
+      "search_text"
+    ],
+    "title": "ContainsEvaluationCriteria",
+    "type": "object"
+  },
+  "justificationSchema": {}
+}
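
These `evaluators_types/*.json` files appear to be machine-generated from the Pydantic models (see `generate_types.py` in the file list above). As a shape check, a config instance matching the schema above can be validated with the third-party `jsonschema` package; the instance values here are made up for illustration:

import json

from jsonschema import validate  # third-party: pip install jsonschema

with open("uipath/eval/evaluators_types/ContainsEvaluator.json") as f:
    spec = json.load(f)

config = {
    "name": "ContainsEvaluator",
    "target_output_key": "*",
    "case_sensitive": False,
    "negated": True,
    "default_evaluation_criteria": {"search_text": "order confirmed"},
}
validate(instance=config, schema=spec["evaluatorConfigSchema"])  # raises ValidationError on mismatch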

uipath/eval/evaluators_types/ExactMatchEvaluator.json
@@ -0,0 +1,89 @@
+{
+  "evaluatorTypeId": "uipath-exact-match",
+  "evaluatorConfigSchema": {
+    "$defs": {
+      "OutputEvaluationCriteria": {
+        "description": "Base class for all output evaluation criteria.",
+        "properties": {
+          "expected_output": {
+            "anyOf": [
+              {
+                "additionalProperties": true,
+                "type": "object"
+              },
+              {
+                "type": "string"
+              }
+            ],
+            "title": "Expected Output"
+          }
+        },
+        "required": [
+          "expected_output"
+        ],
+        "title": "OutputEvaluationCriteria",
+        "type": "object"
+      }
+    },
+    "description": "Configuration for the exact match evaluator.",
+    "properties": {
+      "name": {
+        "default": "ExactMatchEvaluator",
+        "title": "Name",
+        "type": "string"
+      },
+      "default_evaluation_criteria": {
+        "anyOf": [
+          {
+            "$ref": "#/$defs/OutputEvaluationCriteria"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null
+      },
+      "target_output_key": {
+        "default": "*",
+        "description": "Key to extract output from agent execution",
+        "title": "Target Output Key",
+        "type": "string"
+      },
+      "case_sensitive": {
+        "default": false,
+        "title": "Case Sensitive",
+        "type": "boolean"
+      },
+      "negated": {
+        "default": false,
+        "title": "Negated",
+        "type": "boolean"
+      }
+    },
+    "title": "ExactMatchEvaluatorConfig",
+    "type": "object"
+  },
+  "evaluationCriteriaSchema": {
+    "description": "Base class for all output evaluation criteria.",
+    "properties": {
+      "expected_output": {
+        "anyOf": [
+          {
+            "additionalProperties": true,
+            "type": "object"
+          },
+          {
+            "type": "string"
+          }
+        ],
+        "title": "Expected Output"
+      }
+    },
+    "required": [
+      "expected_output"
+    ],
+    "title": "OutputEvaluationCriteria",
+    "type": "object"
+  },
+  "justificationSchema": {}
+}