uipath 2.1.107__py3-none-any.whl → 2.1.109__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (72)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_runtime/_contracts.py +25 -5
  19. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  20. uipath/_cli/_utils/_eval_set.py +30 -9
  21. uipath/_cli/_utils/_resources.py +21 -0
  22. uipath/_cli/_utils/_studio_project.py +18 -0
  23. uipath/_cli/cli_add.py +114 -0
  24. uipath/_cli/cli_eval.py +5 -1
  25. uipath/_cli/cli_pull.py +11 -26
  26. uipath/_cli/cli_push.py +2 -0
  27. uipath/_cli/cli_register.py +45 -0
  28. uipath/_events/_events.py +6 -5
  29. uipath/_resources/SDK_REFERENCE.md +0 -97
  30. uipath/_uipath.py +10 -37
  31. uipath/_utils/constants.py +4 -0
  32. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  33. uipath/eval/_helpers/helpers.py +30 -2
  34. uipath/eval/evaluators/__init__.py +60 -5
  35. uipath/eval/evaluators/base_evaluator.py +546 -44
  36. uipath/eval/evaluators/contains_evaluator.py +80 -0
  37. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  38. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  39. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  40. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  41. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  42. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  43. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  44. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  45. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  46. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  47. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  48. uipath/eval/evaluators/output_evaluator.py +117 -0
  49. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  50. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  51. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  52. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  53. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  54. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  55. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  56. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  57. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  58. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  59. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  60. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  61. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  62. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  63. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  64. uipath/eval/evaluators_types/generate_types.py +31 -0
  65. uipath/eval/models/__init__.py +16 -1
  66. uipath/eval/models/llm_judge_types.py +196 -0
  67. uipath/eval/models/models.py +109 -7
  68. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
  69. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
  70. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
  71. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
  72. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/eval/evaluators_types/ToolCallCountEvaluator.json
@@ -0,0 +1,104 @@
+ {
+   "evaluatorTypeId": "uipath-tool-call-count",
+   "evaluatorConfigSchema": {
+     "$defs": {
+       "ToolCallCountEvaluationCriteria": {
+         "description": "Evaluation criteria for the tool call count evaluator.",
+         "properties": {
+           "tool_calls_count": {
+             "additionalProperties": {
+               "maxItems": 2,
+               "minItems": 2,
+               "prefixItems": [
+                 {
+                   "type": "string"
+                 },
+                 {
+                   "type": "integer"
+                 }
+               ],
+               "type": "array"
+             },
+             "title": "Tool Calls Count",
+             "type": "object"
+           }
+         },
+         "required": [
+           "tool_calls_count"
+         ],
+         "title": "ToolCallCountEvaluationCriteria",
+         "type": "object"
+       }
+     },
+     "description": "Configuration for the tool call count evaluator.",
+     "properties": {
+       "name": {
+         "default": "ToolCallCountEvaluator",
+         "title": "Name",
+         "type": "string"
+       },
+       "default_evaluation_criteria": {
+         "anyOf": [
+           {
+             "$ref": "#/$defs/ToolCallCountEvaluationCriteria"
+           },
+           {
+             "type": "null"
+           }
+         ],
+         "default": null
+       },
+       "strict": {
+         "default": false,
+         "title": "Strict",
+         "type": "boolean"
+       }
+     },
+     "title": "ToolCallCountEvaluatorConfig",
+     "type": "object"
+   },
+   "evaluationCriteriaSchema": {
+     "description": "Evaluation criteria for the tool call count evaluator.",
+     "properties": {
+       "tool_calls_count": {
+         "additionalProperties": {
+           "maxItems": 2,
+           "minItems": 2,
+           "prefixItems": [
+             {
+               "type": "string"
+             },
+             {
+               "type": "integer"
+             }
+           ],
+           "type": "array"
+         },
+         "title": "Tool Calls Count",
+         "type": "object"
+       }
+     },
+     "required": [
+       "tool_calls_count"
+     ],
+     "title": "ToolCallCountEvaluationCriteria",
+     "type": "object"
+   },
+   "justificationSchema": {
+     "description": "Justification for the tool call count evaluator.",
+     "properties": {
+       "explained_tool_calls_count": {
+         "additionalProperties": {
+           "type": "string"
+         },
+         "title": "Explained Tool Calls Count",
+         "type": "object"
+       }
+     },
+     "required": [
+       "explained_tool_calls_count"
+     ],
+     "title": "ToolCallCountEvaluatorJustification",
+     "type": "object"
+   }
+ }
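For orientation, the criteria shape above maps each tool name to a two-element array of a string and an integer. Below is a minimal sketch of a payload that should validate against the `evaluationCriteriaSchema` shown above, checked with the third-party `jsonschema` package (assuming a recent version that understands Draft 2020-12 keywords such as `prefixItems`). Reading the string element as a comparison operator is an assumption, not something the schema states, and the file path is simply where this JSON sits inside the wheel.

```python
import json

from jsonschema import validate  # third-party validator, not part of uipath

# Hypothetical criteria instance: tool name -> [string, integer] pair.
# Treating the string as a comparison operator is only an assumption;
# the schema constrains the types, not the semantics.
criteria = {
    "tool_calls_count": {
        "search_invoices": [">=", 1],
        "send_email": ["==", 2],
    }
}

# Path of the schema file inside this wheel; adjust to your install location.
with open("uipath/eval/evaluators_types/ToolCallCountEvaluator.json") as f:
    spec = json.load(f)

# Raises jsonschema.ValidationError if the instance does not match.
validate(instance=criteria, schema=spec["evaluationCriteriaSchema"])
```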
uipath/eval/evaluators_types/ToolCallOrderEvaluator.json
@@ -0,0 +1,100 @@
+ {
+   "evaluatorTypeId": "uipath-tool-call-order",
+   "evaluatorConfigSchema": {
+     "$defs": {
+       "ToolCallOrderEvaluationCriteria": {
+         "description": "Evaluation criteria for the tool call order evaluator.",
+         "properties": {
+           "tool_calls_order": {
+             "items": {
+               "type": "string"
+             },
+             "title": "Tool Calls Order",
+             "type": "array"
+           }
+         },
+         "required": [
+           "tool_calls_order"
+         ],
+         "title": "ToolCallOrderEvaluationCriteria",
+         "type": "object"
+       }
+     },
+     "description": "Configuration for the tool call count evaluator.",
+     "properties": {
+       "name": {
+         "default": "ToolCallOrderEvaluator",
+         "title": "Name",
+         "type": "string"
+       },
+       "default_evaluation_criteria": {
+         "anyOf": [
+           {
+             "$ref": "#/$defs/ToolCallOrderEvaluationCriteria"
+           },
+           {
+             "type": "null"
+           }
+         ],
+         "default": null
+       },
+       "strict": {
+         "default": false,
+         "title": "Strict",
+         "type": "boolean"
+       }
+     },
+     "title": "ToolCallOrderEvaluatorConfig",
+     "type": "object"
+   },
+   "evaluationCriteriaSchema": {
+     "description": "Evaluation criteria for the tool call order evaluator.",
+     "properties": {
+       "tool_calls_order": {
+         "items": {
+           "type": "string"
+         },
+         "title": "Tool Calls Order",
+         "type": "array"
+       }
+     },
+     "required": [
+       "tool_calls_order"
+     ],
+     "title": "ToolCallOrderEvaluationCriteria",
+     "type": "object"
+   },
+   "justificationSchema": {
+     "description": "Justification for the tool call order evaluator.",
+     "properties": {
+       "actual_tool_calls_order": {
+         "items": {
+           "type": "string"
+         },
+         "title": "Actual Tool Calls Order",
+         "type": "array"
+       },
+       "expected_tool_calls_order": {
+         "items": {
+           "type": "string"
+         },
+         "title": "Expected Tool Calls Order",
+         "type": "array"
+       },
+       "lcs": {
+         "items": {
+           "type": "string"
+         },
+         "title": "Lcs",
+         "type": "array"
+       }
+     },
+     "required": [
+       "actual_tool_calls_order",
+       "expected_tool_calls_order",
+       "lcs"
+     ],
+     "title": "ToolCallOrderEvaluatorJustification",
+     "type": "object"
+   }
+ }
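The justification schema above reports the actual order, the expected order, and an `lcs` field, which points to a longest-common-subsequence comparison between the two sequences. The sketch below only illustrates that idea and is not the package's actual scoring code; the final-score formula in the last comment is an assumption.

```python
def lcs(expected: list[str], actual: list[str]) -> list[str]:
    """Longest common subsequence of two tool-call sequences (illustrative)."""
    m, n = len(expected), len(actual)
    # dp[i][j] = LCS length of expected[:i] and actual[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if expected[i - 1] == actual[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    # Walk back through the table to recover one LCS.
    out: list[str] = []
    i, j = m, n
    while i > 0 and j > 0:
        if expected[i - 1] == actual[j - 1]:
            out.append(expected[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return out[::-1]


expected_order = ["search", "summarize", "send_email"]
actual_order = ["search", "send_email"]
print(lcs(expected_order, actual_order))  # ['search', 'send_email']
# One plausible score: len(lcs) / len(expected_order) -> 2/3 here (assumption).
```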
uipath/eval/evaluators_types/ToolCallOutputEvaluator.json
@@ -0,0 +1,124 @@
+ {
+   "evaluatorTypeId": "uipath-tool-call-output",
+   "evaluatorConfigSchema": {
+     "$defs": {
+       "ToolCallOutputEvaluationCriteria": {
+         "description": "Evaluation criteria for the tool call order evaluator.",
+         "properties": {
+           "tool_outputs": {
+             "items": {
+               "$ref": "#/$defs/ToolOutput"
+             },
+             "title": "Tool Outputs",
+             "type": "array"
+           }
+         },
+         "required": [
+           "tool_outputs"
+         ],
+         "title": "ToolCallOutputEvaluationCriteria",
+         "type": "object"
+       },
+       "ToolOutput": {
+         "description": "Represents a tool output with its output.",
+         "properties": {
+           "name": {
+             "title": "Name",
+             "type": "string"
+           },
+           "output": {
+             "title": "Output",
+             "type": "string"
+           }
+         },
+         "required": [
+           "name",
+           "output"
+         ],
+         "title": "ToolOutput",
+         "type": "object"
+       }
+     },
+     "description": "Configuration for the tool call count evaluator.",
+     "properties": {
+       "name": {
+         "default": "ToolCallOutputEvaluator",
+         "title": "Name",
+         "type": "string"
+       },
+       "default_evaluation_criteria": {
+         "anyOf": [
+           {
+             "$ref": "#/$defs/ToolCallOutputEvaluationCriteria"
+           },
+           {
+             "type": "null"
+           }
+         ],
+         "default": null
+       },
+       "strict": {
+         "default": false,
+         "title": "Strict",
+         "type": "boolean"
+       }
+     },
+     "title": "ToolCallOutputEvaluatorConfig",
+     "type": "object"
+   },
+   "evaluationCriteriaSchema": {
+     "$defs": {
+       "ToolOutput": {
+         "description": "Represents a tool output with its output.",
+         "properties": {
+           "name": {
+             "title": "Name",
+             "type": "string"
+           },
+           "output": {
+             "title": "Output",
+             "type": "string"
+           }
+         },
+         "required": [
+           "name",
+           "output"
+         ],
+         "title": "ToolOutput",
+         "type": "object"
+       }
+     },
+     "description": "Evaluation criteria for the tool call order evaluator.",
+     "properties": {
+       "tool_outputs": {
+         "items": {
+           "$ref": "#/$defs/ToolOutput"
+         },
+         "title": "Tool Outputs",
+         "type": "array"
+       }
+     },
+     "required": [
+       "tool_outputs"
+     ],
+     "title": "ToolCallOutputEvaluationCriteria",
+     "type": "object"
+   },
+   "justificationSchema": {
+     "description": "Justification for the tool call output evaluator.",
+     "properties": {
+       "explained_tool_calls_outputs": {
+         "additionalProperties": {
+           "type": "string"
+         },
+         "title": "Explained Tool Calls Outputs",
+         "type": "object"
+       }
+     },
+     "required": [
+       "explained_tool_calls_outputs"
+     ],
+     "title": "ToolCallOutputEvaluatorJustification",
+     "type": "object"
+   }
+ }
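For the output evaluator, the criteria are a list of `ToolOutput` objects rather than a map. Below is a hypothetical payload shaped after `ToolCallOutputEvaluationCriteria` above; the tool names and outputs are invented for illustration, and whether outputs are compared exactly or more loosely is not specified by the schema itself.

```python
# Hypothetical evaluation-criteria payload for the tool call output evaluator:
# a list of ToolOutput objects, each pairing a tool name with its expected output.
criteria = {
    "tool_outputs": [
        {"name": "lookup_order", "output": '{"status": "shipped"}'},
        {"name": "send_email", "output": "Email sent to customer."},
    ]
}
```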
uipath/eval/evaluators_types/generate_types.py
@@ -0,0 +1,31 @@
+ """Generate the JSON types for all evaluators."""
+
+ import json
+ import os
+ from typing import Any
+
+ from uipath.eval.evaluators import EVALUATORS
+
+
+ def generate_evaluator_json_types(
+     write_to_file: bool = False, indent: int | str | None = None
+ ) -> dict[str, Any]:
+     """Generate the JSON types for all evaluators."""
+     OUTPUT_PATH = os.path.dirname(os.path.abspath(__file__))
+
+     os.makedirs(OUTPUT_PATH, exist_ok=True)
+
+     evaluator_json_types = {}
+     for evaluator in EVALUATORS:
+         evaluator_json_type = evaluator.generate_json_type()
+         evaluator_json_types[evaluator.__name__] = evaluator_json_type
+         if write_to_file:
+             with open(
+                 os.path.join(OUTPUT_PATH, f"{evaluator.__name__}.json"), "w"
+             ) as f:
+                 json.dump(evaluator_json_type, f, indent=indent)
+     return evaluator_json_types
+
+
+ if __name__ == "__main__":
+     generate_evaluator_json_types(write_to_file=True, indent=2)
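The generator above collects `generate_json_type()` from every class in `EVALUATORS` and, when asked, writes one `<EvaluatorName>.json` next to the script, which is how the files under `uipath/eval/evaluators_types/` in this release were produced. A hedged usage sketch follows; the import path mirrors the file location and assumes the directory is importable as a package in the installed wheel.

```python
# Sketch: rebuild the evaluator type definitions in memory, without writing files.
from uipath.eval.evaluators_types.generate_types import generate_evaluator_json_types

types_by_name = generate_evaluator_json_types(write_to_file=False)

# The dict is keyed by evaluator class name, e.g. "ToolCallCountEvaluator".
print(sorted(types_by_name))
print(types_by_name["ToolCallCountEvaluator"]["evaluatorTypeId"])  # uipath-tool-call-count
```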
uipath/eval/models/__init__.py
@@ -1,19 +1,34 @@
  """UiPath evaluation module for agent performance assessment."""

- from uipath.eval.models.models import (
+ from .models import (
+     AgentExecution,
      BooleanEvaluationResult,
      ErrorEvaluationResult,
      EvalItemResult,
      EvaluationResult,
+     EvaluatorType,
+     LegacyEvaluatorCategory,
+     LegacyEvaluatorType,
+     LLMResponse,
      NumericEvaluationResult,
      ScoreType,
+     ToolCall,
+     ToolOutput,
  )

  __all__ = [
+     "AgentExecution",
      "EvaluationResult",
+     "LLMResponse",
+     "LegacyEvaluatorCategory",
+     "LegacyEvaluatorType",
+     "EvaluatorType",
      "ScoreType",
      "EvalItemResult",
      "BooleanEvaluationResult",
      "NumericEvaluationResult",
      "ErrorEvaluationResult",
+     "ToolCall",
+     "EvaluatorType",
+     "ToolOutput",
  ]
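With the re-export list extended as above, the new model names resolve directly from `uipath.eval.models`. A minimal import sketch, using only names added to `__all__` in this diff:

```python
from uipath.eval.models import (
    AgentExecution,
    EvaluatorType,
    LegacyEvaluatorCategory,
    LegacyEvaluatorType,
    LLMResponse,
    ToolCall,
    ToolOutput,
)
```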
uipath/eval/models/llm_judge_types.py
@@ -0,0 +1,196 @@
+ """Types for LLM judge evaluators."""
+
+ from enum import Enum
+
+ from pydantic import BaseModel, Field
+
+
+ class LLMJudgeOutputSchema(BaseModel):
+     """Schema for LLM judge output."""
+
+     justification: str = Field(
+         ...,
+         description="A clear analysis of the semantic similarity of the input contents that appears BEFORE reaching a numeric score. It must justify every penalty or lenience, and mention the effects of any deviation.",
+     )
+     score: float = Field(
+         ...,
+         description="The final rounded integer between 0 and 100, computed strictly from the rubric in the prompt. It must follow the reasoning and contain only the number-no additional text.",
+     )
+
+
+ class LLMJudgeStrictJSONSimilarityOutputSchema(BaseModel):
+     """Schema for LLM judge strict JSON similarity output."""
+
+     justification: str = Field(
+         ...,
+         description="A clear, ≤250-word analysis that appears BEFORE the numeric score. It must discuss every key from ExpectedOutput, state whether each value in ActualOutput is equivalent, partially correct, or incorrect/missing, justify every penalty or lenience, and mention effects of extra keys.",
+     )
+     score: float = Field(
+         ...,
+         description="The final rounded integer between 0 and 100, computed strictly from the rubric in the prompt. It must follow the reasoning and contain only the number—no additional text.",
+     )
+
+
+ class LLMJudgeTrajectoryOutputSchema(BaseModel):
+     """Schema for LLM judge trajectory output."""
+
+     justification: str = Field(
+         ...,
+         description="A clear analysis of the similarity between the expected behavior and the actual behavior of the agent that appears BEFORE reaching a numeric score. It must justify every penalty or lenience, and mention the effects of any deviation. Include the expected behavior, and the actual behavior of the agent.",
+     )
+     score: float = Field(
+         ...,
+         description="The final rounded integer between 0 and 100, computed strictly from the rubric in the prompt. It must follow the reasoning and contain only the number—no additional text.",
+     )
+
+
+ class LLMJudgePromptTemplates(str, Enum):
+     """Templates for LLM judge prompts."""
+
+     LLM_JUDGE_SYSTEM_PROMPT = """You are an expert evaluator tasked with assessing text based on specific criteria. You will be given:
+ 1. An evaluation criterion or question.
+ 2. A text to evaluate.
+ Your task is to carefully analyze the given text according to the specified criterion.
+ If the criterion asks for a degree or extent, respond with a numerical score from 0 to 100:
+ 0 means the text does not meet the criterion at all.
+ 100 means the text fully meets the criterion.
+ If the criterion is a yes/no question or can be answered with true/false, respond with a boolean: true or false.
+ To submit your evaluation, use the correct tool for the score type.
+ Never answer using text. Only use the tool to submit your score.
+ """
+
+     LLM_JUDGE_DEFAULT_USER_PROMPT = """As an expert evaluator, analyze the semantic similarity of these JSON contents to determine a score from 0-100. Focus on comparing the meaning and contextual equivalence of corresponding fields, accounting for alternative valid expressions, synonyms, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.
+ ----
+ ExpectedOutput:
+ {{ExpectedOutput}}
+ ----
+ ActualOutput:
+ {{ActualOutput}}"""
+
+     LLM_JUDGE_STRICT_JSON_SIMILARITY_SYSTEM_PROMPT = """You are an impartial grading agent.
+
+ ⚠️ STEP 1: MANDATORY KEY INVENTORY (EXACT COUNTING)
+ List the exact top-level keys by copying them character-for-character:
+
+ Expected keys: ['key1', 'key2', 'key3', ...]
+ Actual keys: ['key1', 'key2', ...]
+ N (total expected keys): [exact integer]
+
+ ⚠️ STEP 2: DETERMINISTIC KEY MATCHING
+ For each expected key, check if EXACTLY THE SAME key name exists in actual output:
+
+ Expected Key 'KeyName1': EXISTS in actual? [YES/NO]
+ Expected Key 'KeyName2': EXISTS in actual? [YES/NO]
+ [Continue for all expected keys]
+
+ ⚠️ STEP 3: EXTRA KEY IDENTIFICATION
+ List any actual keys not in expected:
+ Extra keys: ['extrakey1', 'extrakey2', ...] or [NONE]
+
+ ⚠️ STEP 4: CONTENT ASSESSMENT (ONLY FOR MATCHING KEYS)
+ For keys that exist in both (from Step 2), assess content:
+ Key 'KeyName': Content assessment [IDENTICAL/SIMILAR/DIFFERENT]
+ [Only assess keys that showed YES in Step 2]
+
+ ⚠️ STEP 5: MECHANICAL SCORING
+ Apply these exact penalties:
+ - Missing key (not in actual): 100/N points each
+ - Similar key (exists with similar content): 50/N points each
+ - Wrong key (exists but SIGNIFICANTLY different content): 100/N points each
+ - Identical key (exists with IDENTICAL content): 0 points each
+ - Extra key (in actual but not expected): 10/N points each
+
+ ⚠️ MECHANICAL CATEGORIZATION:
+ Based on Steps 1-4, categorize each expected key:
+
+ 1. 'ExpectedKey1' → [MISSING/WRONG/SIMILAR/IDENTICAL] → Penalty: [calculation]
+ 2. 'ExpectedKey2' → [MISSING/WRONG/SIMILAR/IDENTICAL] → Penalty: [calculation]
+ [Continue for all expected keys]
+
+ Extra keys: [count] × (10/N) = [calculation]
+
+ ⚠️ EXACT ARITHMETIC:
+ Penalty calculations (show all work):
+ - N = [number]
+ - Missing keys: [count] × (100/[N]) = [count] × [decimal] = [total]
+ - Wrong keys: [count] × (100/[N]) = [count] × [decimal] = [total]
+ - Similar keys: [count] × (50/[N]) = [count] × [decimal] = [total]
+ - Extra keys: [count] × (10/[N]) = [count] × [decimal] = [total]
+
+ Total penalty: [sum all penalties] = [final penalty]
+ Final score: 100 - [final penalty] = [score] (minimum 0)
+
+ ⚠️ VERIFICATION CHECKLIST:
+ - Did I count N correctly by listing all expected keys?
+ - Did I check EXACT key name matches (character-for-character)?
+ - Did I only assess content for keys that exist in both?
+ - Did I calculate exact penalty fractions (100/N, not 100)?
+ - Did I show all arithmetic work step by step?
+ - Is my final score between 0 and 100?
+
+ ⚠️ CRITICAL RULES FOR CONSISTENCY:
+ - NEVER use semantic interpretation for key names (must be exact match)
+ - NEVER assess content for missing keys
+ - ALWAYS calculate penalties as fractions of N
+ - ALWAYS show exact arithmetic work
+ - IDENTICAL inputs MUST produce IDENTICAL outputs.
+
+ ⚠️ DETERMINISTIC REQUIREMENTS:
+ • Key matching is purely textual (character-by-character comparison)
+ • Content assessment is only for keys that exist in both outputs
+ • All arithmetic must be shown with exact fractions"""
+
+     LLM_JUDGE_STRICT_JSON_SIMILARITY_DEFAULT_USER_PROMPT = """ExpectedOutput (ground truth):\n{{ExpectedOutput}}\n\nActualOutput (model answer):\n{{ActualOutput}}"""
+
+     LLM_JUDGE_SIMULATION_TRAJECTORY_SYSTEM_PROMPT = """You are an expert evaluator tasked with assessing an agent running through a simulation.
+ The simulation engine was used to mock the tool responses given during the agent run based on the simulation instructions.
+ The agent did not know that the tool responses are simulated.
+ You will be given:
+ 1. The instructions the simulation engine was given to mock the tool responses given during the agent run.
+ 2. Expected behavior for the agent during the simulation.
+ 3. A trace/history of the agent run.
+ 4. The agent configuration used during the run.
+ Your task is to carefully analyze the agent run trace and it's output according to the specified criterion.
+ 0 means the agent did not meet the criterion at all.
+ 100 means the agent fully met the criterion.
+ To submit your evaluation, use the correct tool for the score type.
+ Never answer using text. Only use the tool to submit your score.
+ """
+
+     LLM_JUDGE_SIMULATION_TRAJECTORY_DEFAULT_USER_PROMPT = """As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.
+ ----
+ AgentInput:
+ {{UserOrSyntheticInput}}
+ ----
+ SimulationInstructions:
+ {{SimulationInstructions}}
+ ----
+ ExpectedAgentBehavior:
+ {{ExpectedAgentBehavior}}
+ ----
+ AgentRunHistory:
+ {{AgentRunHistory}}
+ """
+
+     LLM_JUDGE_TRAJECTORY_SYSTEM_PROMPT = """You are an expert evaluator tasked with assessing an agent's behavior based on its execution trajectory in a simulation or real environment.
+ You will be given:
+ 1. Expected behavior for the agent during the run.
+ 2. A trace/history of the agent's actions and outputs.
+ 3. The agent configuration used during the run.
+ Your task is to carefully analyze the agent's trajectory and output according to the specified criterion.
+ A score of 0 means the agent did not meet the criterion at all, while 100 means the agent fully met the criterion.
+ To submit your evaluation, use the correct tool for the score type.
+ Never answer using text. Only use the tool to submit your score.
+ """
+
+     LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT = """As an expert evaluator, determine how well the agent performed on a scale of 0-100. Focus on whether the agent's actions and outputs matched the expected behavior, while allowing for alternative valid expressions and reasonable variations in language. Maintain high standards for accuracy and completeness. Provide your score with a brief and clear justification explaining your reasoning.
+ ----
+ AgentInput:
+ {{UserOrSyntheticInput}}
+ ----
+ ExpectedAgentBehavior:
+ {{ExpectedAgentBehavior}}
+ ----
+ AgentRunHistory:
+ {{AgentRunHistory}}
+ """
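The templates above carry `{{...}}` placeholders, and the output schemas pair a justification with a 0-100 score. Below is a minimal sketch of how a caller might fill the default user prompt and parse a judge reply into `LLMJudgeOutputSchema`; the plain string replacement and the pydantic v2 `model_validate_json` call are assumptions about usage, not the SDK's own template rendering or LLM plumbing.

```python
import json

from uipath.eval.models.llm_judge_types import (
    LLMJudgeOutputSchema,
    LLMJudgePromptTemplates,
)

# Fill the default user prompt; simple replacement of the {{...}} placeholders
# is an assumption, and the evaluator's real template rendering may differ.
user_prompt = (
    LLMJudgePromptTemplates.LLM_JUDGE_DEFAULT_USER_PROMPT.value
    .replace("{{ExpectedOutput}}", json.dumps({"status": "approved"}))
    .replace("{{ActualOutput}}", json.dumps({"status": "Approved"}))
)

# A hypothetical raw judge reply, validated against the output schema.
raw_reply = '{"justification": "Values match apart from casing.", "score": 95}'
result = LLMJudgeOutputSchema.model_validate_json(raw_reply)
print(result.score)  # 95.0
```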