uipath 2.1.108__py3-none-any.whl → 2.1.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of uipath might be problematic. Click here for more details.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/RECORD +69 -37
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
{
|
|
2
|
+
"evaluatorTypeId": "uipath-json-similarity",
|
|
3
|
+
"evaluatorConfigSchema": {
|
|
4
|
+
"$defs": {
|
|
5
|
+
"OutputEvaluationCriteria": {
|
|
6
|
+
"description": "Base class for all output evaluation criteria.",
|
|
7
|
+
"properties": {
|
|
8
|
+
"expected_output": {
|
|
9
|
+
"anyOf": [
|
|
10
|
+
{
|
|
11
|
+
"additionalProperties": true,
|
|
12
|
+
"type": "object"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"type": "string"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"title": "Expected Output"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"required": [
|
|
22
|
+
"expected_output"
|
|
23
|
+
],
|
|
24
|
+
"title": "OutputEvaluationCriteria",
|
|
25
|
+
"type": "object"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"description": "Configuration for the json similarity evaluator.",
|
|
29
|
+
"properties": {
|
|
30
|
+
"name": {
|
|
31
|
+
"default": "JsonSimilarityEvaluator",
|
|
32
|
+
"title": "Name",
|
|
33
|
+
"type": "string"
|
|
34
|
+
},
|
|
35
|
+
"default_evaluation_criteria": {
|
|
36
|
+
"anyOf": [
|
|
37
|
+
{
|
|
38
|
+
"$ref": "#/$defs/OutputEvaluationCriteria"
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"type": "null"
|
|
42
|
+
}
|
|
43
|
+
],
|
|
44
|
+
"default": null
|
|
45
|
+
},
|
|
46
|
+
"target_output_key": {
|
|
47
|
+
"default": "*",
|
|
48
|
+
"description": "Key to extract output from agent execution",
|
|
49
|
+
"title": "Target Output Key",
|
|
50
|
+
"type": "string"
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
"title": "JsonSimilarityEvaluatorConfig",
|
|
54
|
+
"type": "object"
|
|
55
|
+
},
|
|
56
|
+
"evaluationCriteriaSchema": {
|
|
57
|
+
"description": "Base class for all output evaluation criteria.",
|
|
58
|
+
"properties": {
|
|
59
|
+
"expected_output": {
|
|
60
|
+
"anyOf": [
|
|
61
|
+
{
|
|
62
|
+
"additionalProperties": true,
|
|
63
|
+
"type": "object"
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"type": "string"
|
|
67
|
+
}
|
|
68
|
+
],
|
|
69
|
+
"title": "Expected Output"
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"required": [
|
|
73
|
+
"expected_output"
|
|
74
|
+
],
|
|
75
|
+
"title": "OutputEvaluationCriteria",
|
|
76
|
+
"type": "object"
|
|
77
|
+
},
|
|
78
|
+
"justificationSchema": {
|
|
79
|
+
"type": "string"
|
|
80
|
+
}
|
|
81
|
+
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
{
|
|
2
|
+
"evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity",
|
|
3
|
+
"evaluatorConfigSchema": {
|
|
4
|
+
"$defs": {
|
|
5
|
+
"OutputEvaluationCriteria": {
|
|
6
|
+
"description": "Base class for all output evaluation criteria.",
|
|
7
|
+
"properties": {
|
|
8
|
+
"expected_output": {
|
|
9
|
+
"anyOf": [
|
|
10
|
+
{
|
|
11
|
+
"additionalProperties": true,
|
|
12
|
+
"type": "object"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"type": "string"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"title": "Expected Output"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"required": [
|
|
22
|
+
"expected_output"
|
|
23
|
+
],
|
|
24
|
+
"title": "OutputEvaluationCriteria",
|
|
25
|
+
"type": "object"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"description": "Configuration for the LLM judge output evaluator.",
|
|
29
|
+
"properties": {
|
|
30
|
+
"name": {
|
|
31
|
+
"default": "LLMJudgeOutputEvaluator",
|
|
32
|
+
"title": "Name",
|
|
33
|
+
"type": "string"
|
|
34
|
+
},
|
|
35
|
+
"default_evaluation_criteria": {
|
|
36
|
+
"anyOf": [
|
|
37
|
+
{
|
|
38
|
+
"$ref": "#/$defs/OutputEvaluationCriteria"
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"type": "null"
|
|
42
|
+
}
|
|
43
|
+
],
|
|
44
|
+
"default": null
|
|
45
|
+
},
|
|
46
|
+
"prompt": {
|
|
47
|
+
"default": "As an expert evaluator, analyze the semantic similarity of these JSON contents to determine a score from 0-100. Focus on comparing the meaning and contextual equivalence of corresponding fields, accounting for alternative valid expressions, synonyms, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nExpectedOutput:\n{{ExpectedOutput}}\n----\nActualOutput:\n{{ActualOutput}}",
|
|
48
|
+
"title": "Prompt",
|
|
49
|
+
"type": "string"
|
|
50
|
+
},
|
|
51
|
+
"model": {
|
|
52
|
+
"title": "Model",
|
|
53
|
+
"type": "string"
|
|
54
|
+
},
|
|
55
|
+
"temperature": {
|
|
56
|
+
"default": 0.0,
|
|
57
|
+
"title": "Temperature",
|
|
58
|
+
"type": "number"
|
|
59
|
+
},
|
|
60
|
+
"max_tokens": {
|
|
61
|
+
"anyOf": [
|
|
62
|
+
{
|
|
63
|
+
"type": "integer"
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"type": "null"
|
|
67
|
+
}
|
|
68
|
+
],
|
|
69
|
+
"default": null,
|
|
70
|
+
"title": "Max Tokens"
|
|
71
|
+
},
|
|
72
|
+
"target_output_key": {
|
|
73
|
+
"default": "*",
|
|
74
|
+
"description": "Key to extract output from agent execution",
|
|
75
|
+
"title": "Target Output Key",
|
|
76
|
+
"type": "string"
|
|
77
|
+
}
|
|
78
|
+
},
|
|
79
|
+
"required": [
|
|
80
|
+
"model"
|
|
81
|
+
],
|
|
82
|
+
"title": "LLMJudgeOutputEvaluatorConfig",
|
|
83
|
+
"type": "object"
|
|
84
|
+
},
|
|
85
|
+
"evaluationCriteriaSchema": {
|
|
86
|
+
"description": "Base class for all output evaluation criteria.",
|
|
87
|
+
"properties": {
|
|
88
|
+
"expected_output": {
|
|
89
|
+
"anyOf": [
|
|
90
|
+
{
|
|
91
|
+
"additionalProperties": true,
|
|
92
|
+
"type": "object"
|
|
93
|
+
},
|
|
94
|
+
{
|
|
95
|
+
"type": "string"
|
|
96
|
+
}
|
|
97
|
+
],
|
|
98
|
+
"title": "Expected Output"
|
|
99
|
+
}
|
|
100
|
+
},
|
|
101
|
+
"required": [
|
|
102
|
+
"expected_output"
|
|
103
|
+
],
|
|
104
|
+
"title": "OutputEvaluationCriteria",
|
|
105
|
+
"type": "object"
|
|
106
|
+
},
|
|
107
|
+
"justificationSchema": {
|
|
108
|
+
"type": "string"
|
|
109
|
+
}
|
|
110
|
+
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
{
|
|
2
|
+
"evaluatorTypeId": "uipath-llm-judge-trajectory-simulation",
|
|
3
|
+
"evaluatorConfigSchema": {
|
|
4
|
+
"$defs": {
|
|
5
|
+
"TrajectoryEvaluationCriteria": {
|
|
6
|
+
"description": "Evaluation criteria for trajectory-based evaluations.",
|
|
7
|
+
"properties": {
|
|
8
|
+
"expected_agent_behavior": {
|
|
9
|
+
"title": "Expected Agent Behavior",
|
|
10
|
+
"type": "string"
|
|
11
|
+
}
|
|
12
|
+
},
|
|
13
|
+
"required": [
|
|
14
|
+
"expected_agent_behavior"
|
|
15
|
+
],
|
|
16
|
+
"title": "TrajectoryEvaluationCriteria",
|
|
17
|
+
"type": "object"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"description": "Configuration for the llm judge simulation trajectory evaluator.",
|
|
21
|
+
"properties": {
|
|
22
|
+
"name": {
|
|
23
|
+
"default": "LLMJudgeSimulationEvaluator",
|
|
24
|
+
"title": "Name",
|
|
25
|
+
"type": "string"
|
|
26
|
+
},
|
|
27
|
+
"default_evaluation_criteria": {
|
|
28
|
+
"anyOf": [
|
|
29
|
+
{
|
|
30
|
+
"$ref": "#/$defs/TrajectoryEvaluationCriteria"
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"type": "null"
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"default": null
|
|
37
|
+
},
|
|
38
|
+
"prompt": {
|
|
39
|
+
"default": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nAgentInput:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
|
|
40
|
+
"title": "Prompt",
|
|
41
|
+
"type": "string"
|
|
42
|
+
},
|
|
43
|
+
"model": {
|
|
44
|
+
"title": "Model",
|
|
45
|
+
"type": "string"
|
|
46
|
+
},
|
|
47
|
+
"temperature": {
|
|
48
|
+
"default": 0.0,
|
|
49
|
+
"title": "Temperature",
|
|
50
|
+
"type": "number"
|
|
51
|
+
},
|
|
52
|
+
"max_tokens": {
|
|
53
|
+
"anyOf": [
|
|
54
|
+
{
|
|
55
|
+
"type": "integer"
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"type": "null"
|
|
59
|
+
}
|
|
60
|
+
],
|
|
61
|
+
"default": null,
|
|
62
|
+
"title": "Max Tokens"
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"required": [
|
|
66
|
+
"model"
|
|
67
|
+
],
|
|
68
|
+
"title": "LLMJudgeSimulationEvaluatorConfig",
|
|
69
|
+
"type": "object"
|
|
70
|
+
},
|
|
71
|
+
"evaluationCriteriaSchema": {
|
|
72
|
+
"description": "Evaluation criteria for trajectory-based evaluations.",
|
|
73
|
+
"properties": {
|
|
74
|
+
"expected_agent_behavior": {
|
|
75
|
+
"title": "Expected Agent Behavior",
|
|
76
|
+
"type": "string"
|
|
77
|
+
}
|
|
78
|
+
},
|
|
79
|
+
"required": [
|
|
80
|
+
"expected_agent_behavior"
|
|
81
|
+
],
|
|
82
|
+
"title": "TrajectoryEvaluationCriteria",
|
|
83
|
+
"type": "object"
|
|
84
|
+
},
|
|
85
|
+
"justificationSchema": {
|
|
86
|
+
"type": "string"
|
|
87
|
+
}
|
|
88
|
+
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
{
|
|
2
|
+
"evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity",
|
|
3
|
+
"evaluatorConfigSchema": {
|
|
4
|
+
"$defs": {
|
|
5
|
+
"OutputEvaluationCriteria": {
|
|
6
|
+
"description": "Base class for all output evaluation criteria.",
|
|
7
|
+
"properties": {
|
|
8
|
+
"expected_output": {
|
|
9
|
+
"anyOf": [
|
|
10
|
+
{
|
|
11
|
+
"additionalProperties": true,
|
|
12
|
+
"type": "object"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"type": "string"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"title": "Expected Output"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"required": [
|
|
22
|
+
"expected_output"
|
|
23
|
+
],
|
|
24
|
+
"title": "OutputEvaluationCriteria",
|
|
25
|
+
"type": "object"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"description": "Configuration for the LLM judge strict JSON similarity output evaluator.",
|
|
29
|
+
"properties": {
|
|
30
|
+
"name": {
|
|
31
|
+
"default": "LLMJudgeStrictJSONSimilarityOutputEvaluator",
|
|
32
|
+
"title": "Name",
|
|
33
|
+
"type": "string"
|
|
34
|
+
},
|
|
35
|
+
"default_evaluation_criteria": {
|
|
36
|
+
"anyOf": [
|
|
37
|
+
{
|
|
38
|
+
"$ref": "#/$defs/OutputEvaluationCriteria"
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"type": "null"
|
|
42
|
+
}
|
|
43
|
+
],
|
|
44
|
+
"default": null
|
|
45
|
+
},
|
|
46
|
+
"prompt": {
|
|
47
|
+
"default": "ExpectedOutput (ground truth):\n{{ExpectedOutput}}\n\nActualOutput (model answer):\n{{ActualOutput}}",
|
|
48
|
+
"title": "Prompt",
|
|
49
|
+
"type": "string"
|
|
50
|
+
},
|
|
51
|
+
"model": {
|
|
52
|
+
"title": "Model",
|
|
53
|
+
"type": "string"
|
|
54
|
+
},
|
|
55
|
+
"temperature": {
|
|
56
|
+
"default": 0.0,
|
|
57
|
+
"title": "Temperature",
|
|
58
|
+
"type": "number"
|
|
59
|
+
},
|
|
60
|
+
"max_tokens": {
|
|
61
|
+
"anyOf": [
|
|
62
|
+
{
|
|
63
|
+
"type": "integer"
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"type": "null"
|
|
67
|
+
}
|
|
68
|
+
],
|
|
69
|
+
"default": null,
|
|
70
|
+
"title": "Max Tokens"
|
|
71
|
+
},
|
|
72
|
+
"target_output_key": {
|
|
73
|
+
"default": "*",
|
|
74
|
+
"description": "Key to extract output from agent execution",
|
|
75
|
+
"title": "Target Output Key",
|
|
76
|
+
"type": "string"
|
|
77
|
+
}
|
|
78
|
+
},
|
|
79
|
+
"required": [
|
|
80
|
+
"model"
|
|
81
|
+
],
|
|
82
|
+
"title": "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig",
|
|
83
|
+
"type": "object"
|
|
84
|
+
},
|
|
85
|
+
"evaluationCriteriaSchema": {
|
|
86
|
+
"description": "Base class for all output evaluation criteria.",
|
|
87
|
+
"properties": {
|
|
88
|
+
"expected_output": {
|
|
89
|
+
"anyOf": [
|
|
90
|
+
{
|
|
91
|
+
"additionalProperties": true,
|
|
92
|
+
"type": "object"
|
|
93
|
+
},
|
|
94
|
+
{
|
|
95
|
+
"type": "string"
|
|
96
|
+
}
|
|
97
|
+
],
|
|
98
|
+
"title": "Expected Output"
|
|
99
|
+
}
|
|
100
|
+
},
|
|
101
|
+
"required": [
|
|
102
|
+
"expected_output"
|
|
103
|
+
],
|
|
104
|
+
"title": "OutputEvaluationCriteria",
|
|
105
|
+
"type": "object"
|
|
106
|
+
},
|
|
107
|
+
"justificationSchema": {
|
|
108
|
+
"type": "string"
|
|
109
|
+
}
|
|
110
|
+
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
{
|
|
2
|
+
"evaluatorTypeId": "uipath-llm-judge-trajectory-similarity",
|
|
3
|
+
"evaluatorConfigSchema": {
|
|
4
|
+
"$defs": {
|
|
5
|
+
"TrajectoryEvaluationCriteria": {
|
|
6
|
+
"description": "Evaluation criteria for trajectory-based evaluations.",
|
|
7
|
+
"properties": {
|
|
8
|
+
"expected_agent_behavior": {
|
|
9
|
+
"title": "Expected Agent Behavior",
|
|
10
|
+
"type": "string"
|
|
11
|
+
}
|
|
12
|
+
},
|
|
13
|
+
"required": [
|
|
14
|
+
"expected_agent_behavior"
|
|
15
|
+
],
|
|
16
|
+
"title": "TrajectoryEvaluationCriteria",
|
|
17
|
+
"type": "object"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"description": "Configuration for the llm judge trajectory evaluator.",
|
|
21
|
+
"properties": {
|
|
22
|
+
"name": {
|
|
23
|
+
"default": "LLMJudgeTrajectoryEvaluator",
|
|
24
|
+
"title": "Name",
|
|
25
|
+
"type": "string"
|
|
26
|
+
},
|
|
27
|
+
"default_evaluation_criteria": {
|
|
28
|
+
"anyOf": [
|
|
29
|
+
{
|
|
30
|
+
"$ref": "#/$defs/TrajectoryEvaluationCriteria"
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"type": "null"
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"default": null
|
|
37
|
+
},
|
|
38
|
+
"prompt": {
|
|
39
|
+
"default": "As an expert evaluator, determine how well the agent performed on a scale of 0-100. Focus on whether the agent's actions and outputs matched the expected behavior, while allowing for alternative valid expressions and reasonable variations in language. Maintain high standards for accuracy and completeness. Provide your score with a brief and clear justification explaining your reasoning.\n----\nAgentInput:\n{{UserOrSyntheticInput}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
|
|
40
|
+
"title": "Prompt",
|
|
41
|
+
"type": "string"
|
|
42
|
+
},
|
|
43
|
+
"model": {
|
|
44
|
+
"title": "Model",
|
|
45
|
+
"type": "string"
|
|
46
|
+
},
|
|
47
|
+
"temperature": {
|
|
48
|
+
"default": 0.0,
|
|
49
|
+
"title": "Temperature",
|
|
50
|
+
"type": "number"
|
|
51
|
+
},
|
|
52
|
+
"max_tokens": {
|
|
53
|
+
"anyOf": [
|
|
54
|
+
{
|
|
55
|
+
"type": "integer"
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"type": "null"
|
|
59
|
+
}
|
|
60
|
+
],
|
|
61
|
+
"default": null,
|
|
62
|
+
"title": "Max Tokens"
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"required": [
|
|
66
|
+
"model"
|
|
67
|
+
],
|
|
68
|
+
"title": "LLMJudgeTrajectoryEvaluatorConfig",
|
|
69
|
+
"type": "object"
|
|
70
|
+
},
|
|
71
|
+
"evaluationCriteriaSchema": {
|
|
72
|
+
"description": "Evaluation criteria for trajectory-based evaluations.",
|
|
73
|
+
"properties": {
|
|
74
|
+
"expected_agent_behavior": {
|
|
75
|
+
"title": "Expected Agent Behavior",
|
|
76
|
+
"type": "string"
|
|
77
|
+
}
|
|
78
|
+
},
|
|
79
|
+
"required": [
|
|
80
|
+
"expected_agent_behavior"
|
|
81
|
+
],
|
|
82
|
+
"title": "TrajectoryEvaluationCriteria",
|
|
83
|
+
"type": "object"
|
|
84
|
+
},
|
|
85
|
+
"justificationSchema": {
|
|
86
|
+
"type": "string"
|
|
87
|
+
}
|
|
88
|
+
}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
{
|
|
2
|
+
"evaluatorTypeId": "uipath-tool-call-args",
|
|
3
|
+
"evaluatorConfigSchema": {
|
|
4
|
+
"$defs": {
|
|
5
|
+
"ToolCall": {
|
|
6
|
+
"description": "Represents a tool call with its arguments.",
|
|
7
|
+
"properties": {
|
|
8
|
+
"name": {
|
|
9
|
+
"title": "Name",
|
|
10
|
+
"type": "string"
|
|
11
|
+
},
|
|
12
|
+
"args": {
|
|
13
|
+
"additionalProperties": true,
|
|
14
|
+
"title": "Args",
|
|
15
|
+
"type": "object"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"required": [
|
|
19
|
+
"name",
|
|
20
|
+
"args"
|
|
21
|
+
],
|
|
22
|
+
"title": "ToolCall",
|
|
23
|
+
"type": "object"
|
|
24
|
+
},
|
|
25
|
+
"ToolCallArgsEvaluationCriteria": {
|
|
26
|
+
"description": "Evaluation criteria for the tool call args evaluator.",
|
|
27
|
+
"properties": {
|
|
28
|
+
"tool_calls": {
|
|
29
|
+
"items": {
|
|
30
|
+
"$ref": "#/$defs/ToolCall"
|
|
31
|
+
},
|
|
32
|
+
"title": "Tool Calls",
|
|
33
|
+
"type": "array"
|
|
34
|
+
}
|
|
35
|
+
},
|
|
36
|
+
"required": [
|
|
37
|
+
"tool_calls"
|
|
38
|
+
],
|
|
39
|
+
"title": "ToolCallArgsEvaluationCriteria",
|
|
40
|
+
"type": "object"
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
"description": "Configuration for the tool call args evaluator.",
|
|
44
|
+
"properties": {
|
|
45
|
+
"name": {
|
|
46
|
+
"default": "ToolCallArgsEvaluator",
|
|
47
|
+
"title": "Name",
|
|
48
|
+
"type": "string"
|
|
49
|
+
},
|
|
50
|
+
"default_evaluation_criteria": {
|
|
51
|
+
"anyOf": [
|
|
52
|
+
{
|
|
53
|
+
"$ref": "#/$defs/ToolCallArgsEvaluationCriteria"
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"type": "null"
|
|
57
|
+
}
|
|
58
|
+
],
|
|
59
|
+
"default": null
|
|
60
|
+
},
|
|
61
|
+
"strict": {
|
|
62
|
+
"default": false,
|
|
63
|
+
"title": "Strict",
|
|
64
|
+
"type": "boolean"
|
|
65
|
+
},
|
|
66
|
+
"subset": {
|
|
67
|
+
"default": false,
|
|
68
|
+
"title": "Subset",
|
|
69
|
+
"type": "boolean"
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"title": "ToolCallArgsEvaluatorConfig",
|
|
73
|
+
"type": "object"
|
|
74
|
+
},
|
|
75
|
+
"evaluationCriteriaSchema": {
|
|
76
|
+
"$defs": {
|
|
77
|
+
"ToolCall": {
|
|
78
|
+
"description": "Represents a tool call with its arguments.",
|
|
79
|
+
"properties": {
|
|
80
|
+
"name": {
|
|
81
|
+
"title": "Name",
|
|
82
|
+
"type": "string"
|
|
83
|
+
},
|
|
84
|
+
"args": {
|
|
85
|
+
"additionalProperties": true,
|
|
86
|
+
"title": "Args",
|
|
87
|
+
"type": "object"
|
|
88
|
+
}
|
|
89
|
+
},
|
|
90
|
+
"required": [
|
|
91
|
+
"name",
|
|
92
|
+
"args"
|
|
93
|
+
],
|
|
94
|
+
"title": "ToolCall",
|
|
95
|
+
"type": "object"
|
|
96
|
+
}
|
|
97
|
+
},
|
|
98
|
+
"description": "Evaluation criteria for the tool call args evaluator.",
|
|
99
|
+
"properties": {
|
|
100
|
+
"tool_calls": {
|
|
101
|
+
"items": {
|
|
102
|
+
"$ref": "#/$defs/ToolCall"
|
|
103
|
+
},
|
|
104
|
+
"title": "Tool Calls",
|
|
105
|
+
"type": "array"
|
|
106
|
+
}
|
|
107
|
+
},
|
|
108
|
+
"required": [
|
|
109
|
+
"tool_calls"
|
|
110
|
+
],
|
|
111
|
+
"title": "ToolCallArgsEvaluationCriteria",
|
|
112
|
+
"type": "object"
|
|
113
|
+
},
|
|
114
|
+
"justificationSchema": {
|
|
115
|
+
"description": "Justification for the tool call args evaluator.",
|
|
116
|
+
"properties": {
|
|
117
|
+
"explained_tool_calls_args": {
|
|
118
|
+
"additionalProperties": {
|
|
119
|
+
"type": "string"
|
|
120
|
+
},
|
|
121
|
+
"title": "Explained Tool Calls Args",
|
|
122
|
+
"type": "object"
|
|
123
|
+
}
|
|
124
|
+
},
|
|
125
|
+
"required": [
|
|
126
|
+
"explained_tool_calls_args"
|
|
127
|
+
],
|
|
128
|
+
"title": "ToolCallArgsEvaluatorJustification",
|
|
129
|
+
"type": "object"
|
|
130
|
+
}
|
|
131
|
+
}
|