uipath 2.1.107__py3-none-any.whl → 2.1.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of uipath might be problematic.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_runtime/_contracts.py +25 -5
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_resources/SDK_REFERENCE.md +0 -97
- uipath/_uipath.py +10 -37
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py}

@@ -16,11 +16,11 @@ from ..models.models import (
     NumericEvaluationResult,
     TrajectoryEvaluationTrace,
 )
-from .
+from .legacy_base_evaluator import LegacyBaseEvaluator
 
 
-class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]):
-    """
+class LegacyTrajectoryEvaluator(LegacyBaseEvaluator[dict[str, Any]]):
+    """Legacy evaluator that analyzes the trajectory/path taken to reach outputs."""
 
     prompt: str
     model: str
@@ -38,7 +38,7 @@ class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]):
         )
         return v
 
-    def model_post_init(self, __context):
+    def model_post_init(self, __context: Any):
         """Initialize the LLM service after model creation."""
         super().model_post_init(__context)
         self._initialize_llm()
@@ -76,7 +76,6 @@ class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]):
             expected_agent_behavior=agent_execution.expected_agent_behavior,
             agent_run_history=agent_execution.agent_trace,
         )
-
         llm_response = await self._get_llm_response(evaluation_prompt)
 
         return NumericEvaluationResult(
@@ -160,4 +159,4 @@ class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]):
         }
 
         response = await self.llm.chat_completions(**request_data)
-        return LLMResponse(**json.loads(response.choices[-1].message.content))
+        return LLMResponse(**json.loads(response.choices[-1].message.content or "{}"))
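
The only behavioral change in the final hunk above is defensive: `message.content` on a chat-completion choice may be `None`, and `json.loads(None)` raises a `TypeError`, so the new code substitutes an empty JSON object. A minimal standalone sketch of the same guard (the `Message` dataclass below is a stand-in, not the SDK's type):

import json
from dataclasses import dataclass
from typing import Optional

@dataclass
class Message:
    content: Optional[str]  # chat-completion content may legitimately be None

def parse_payload(message: Message) -> dict:
    # Mirrors the new line in the diff: fall back to "{}" when the model
    # returned no content, so json.loads always receives a string.
    return json.loads(message.content or "{}")

assert parse_payload(Message(content='{"score": 80}')) == {"score": 80}
assert parse_payload(Message(content=None)) == {}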
uipath/eval/evaluators/llm_as_judge_evaluator.py

@@ -1,137 +1,202 @@
 """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
 
 import json
-from
+from abc import abstractmethod
+from collections.abc import Callable
+from typing import Any, TypeVar
+
+from pydantic import BaseModel, Field, model_validator
+
+from .._helpers.evaluators_helpers import COMMUNITY_agents_SUFFIX
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    LLMResponse,
+    NumericEvaluationResult,
+)
+from ..models.llm_judge_types import (
+    LLMJudgeOutputSchema,
+    LLMJudgePromptTemplates,
+)
+from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory
+from .base_evaluator import (
+    BaseEvaluationCriteria,
+    BaseEvaluator,
+    BaseEvaluatorConfig,
+)
+
+T = TypeVar("T", bound=BaseEvaluationCriteria)
+
+
+class BaseLLMJudgeEvaluatorConfig(BaseEvaluatorConfig[T]):
+    """Base config for all LLM evaluators.
+
+    Generic over T (evaluation criteria type) to ensure type safety between
+    the config's default_evaluation_criteria and the evaluator's expected criteria type.
+    """
 
-
+    prompt: str
+    model: str = ""
+    temperature: float = 0.0
+    max_tokens: int | None = None
 
-from uipath.eval.models import NumericEvaluationResult
 
-
-from ..._utils.constants import COMMUNITY_agents_SUFFIX
-from ..models.models import AgentExecution, EvaluationResult, LLMResponse
-from .base_evaluator import BaseEvaluator
+C = TypeVar("C", bound=BaseLLMJudgeEvaluatorConfig[Any])
 
 
-class
-"""
+class LLMJudgeMixin(BaseEvaluator[T, C, str]):
+    """Mixin that provides common LLM judge functionality."""
 
-
-
+    system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_SYSTEM_PROMPT
+    output_schema: type[BaseModel] = LLMJudgeOutputSchema
     actual_output_placeholder: str = "{{ActualOutput}}"
     expected_output_placeholder: str = "{{ExpectedOutput}}"
-
+    llm_service: Callable[..., Any] | None = Field(
+        default=None, exclude=True, description="The LLM service for evaluation"
+    )
 
-    @
-
-    def validate_prompt_placeholders(cls, v: str) -> str:
+    @model_validator(mode="after")
+    def validate_prompt_placeholders(self) -> "LLMJudgeMixin[T, C]":
         """Validate that prompt contains required placeholders."""
-        if
-
-
+        if (
+            self.actual_output_placeholder not in self.evaluator_config.prompt
+            or self.expected_output_placeholder not in self.evaluator_config.prompt
+        ):
+            raise UiPathEvaluationError(
+                code="INVALID_PROMPT_PLACEHOLDERS",
+                title="Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders",
+                detail="Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders",
+                category=UiPathEvaluationErrorCategory.USER,
             )
-        return
+        return self
 
-    def model_post_init(self, __context):
-        """Initialize the LLM service
+    def model_post_init(self, __context: Any) -> None:
+        """Initialize the LLM service if not provided."""
         super().model_post_init(__context)
-        self.
+        if self.llm_service is None:
+            self.llm_service = self._get_llm_service()
 
-    def
-        """
+    def _get_llm_service(self):
+        """Get the LLM service from the UiPath instance."""
         from uipath import UiPath
 
-
-
+        try:
+            uipath = UiPath()
+            return uipath.llm.chat_completions
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="FAILED_TO_GET_LLM_SERVICE",
+                title="Failed to get LLM service from the SDK and no otherLLM service provided",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+
+    @abstractmethod
+    def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
+        """Get the actual output from the agent execution. Must be implemented by concrete evaluator classes."""
+        pass
+
+    @abstractmethod
+    def _get_expected_output(self, evaluation_criteria: T) -> Any:
+        """Get the expected output from the evaluation criteria. Must be implemented by concrete evaluator classes."""
+        pass
 
     async def evaluate(
         self,
         agent_execution: AgentExecution,
-        evaluation_criteria:
+        evaluation_criteria: T,
     ) -> EvaluationResult:
-        """Evaluate using an LLM as a judge.
-
-        Sends the formatted prompt to the configured LLM and expects a JSON response
-        with a numerical score (0-100) and justification.
-
-        agent_execution: The execution details containing:
-            - agent_input: The input received by the agent
-            - actual_output: The actual output from the agent
-            - spans: The execution spans to use for the evaluation
-        evaluation_criteria: The criteria to evaluate
-
-        Returns:
-            EvaluationResult: Numerical score with LLM justification as details
-        """
-        # Create the evaluation prompt
+        """Evaluate using an LLM as a judge."""
         evaluation_prompt = self._create_evaluation_prompt(
-
-
+            agent_execution=agent_execution,
+            evaluation_criteria=evaluation_criteria,
        )
 
         llm_response = await self._get_llm_response(evaluation_prompt)
+        validated_justification = self.validate_justification(
+            llm_response.justification
+        )
 
         return NumericEvaluationResult(
-            score=llm_response.score,
-            details=
+            score=max(0.0, min(1.0, round(llm_response.score / 100.0, 2))),
+            details=validated_justification,
         )
 
     def _create_evaluation_prompt(
-        self,
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: T,
     ) -> str:
         """Create the evaluation prompt for the LLM."""
-        formatted_prompt = self.prompt.replace(
+        formatted_prompt = self.evaluator_config.prompt.replace(
             self.actual_output_placeholder,
-            str(
+            str(self._get_actual_output(agent_execution)),
         )
         formatted_prompt = formatted_prompt.replace(
             self.expected_output_placeholder,
-            str(
+            str(self._get_expected_output(evaluation_criteria)),
         )
 
         return formatted_prompt
 
     async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
-        """Get response from the LLM.
-
-        Args:
-            evaluation_prompt: The formatted prompt to send to the LLM
-
-        Returns:
-            LLMResponse with score and justification
-        """
+        """Get response from the LLM."""
         # remove community-agents suffix from llm model name
-        model = self.model
+        model = self.evaluator_config.model
         if model.endswith(COMMUNITY_agents_SUFFIX):
             model = model.replace(COMMUNITY_agents_SUFFIX, "")
 
         # Prepare the request
         request_data = {
             "model": model,
-            "messages": [
+            "messages": [
+                {"role": "system", "content": self.system_prompt},
+                {"role": "user", "content": evaluation_prompt},
+            ],
             "response_format": {
                 "type": "json_schema",
                 "json_schema": {
                     "name": "evaluation_response",
-                    "schema":
-                        "type": "object",
-                        "properties": {
-                            "score": {
-                                "type": "number",
-                                "minimum": 0,
-                                "maximum": 100,
-                                "description": "Score between 0 and 100",
-                            },
-                            "justification": {
-                                "type": "string",
-                                "description": "Explanation for the score",
-                            },
-                        },
-                        "required": ["score", "justification"],
-                    },
+                    "schema": self.output_schema.model_json_schema(),
                 },
             },
+            "max_tokens": self.evaluator_config.max_tokens,
+            "temperature": self.evaluator_config.temperature,
         }
 
-
-
+        if self.llm_service is None:
+            raise UiPathEvaluationError(
+                code="LLM_SERVICE_NOT_INITIALIZED",
+                title="LLM service not initialized",
+                detail="LLM service not initialized",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        try:
+            response = await self.llm_service(**request_data)
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="FAILED_TO_GET_LLM_RESPONSE",
+                title="Failed to get LLM response",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+
+        try:
+            content = response.choices[-1].message.content
+            if content is None:
+                raise UiPathEvaluationError(
+                    code="EMPTY_LLM_RESPONSE",
+                    title="Empty LLM response",
+                    detail="The LLM response message content was None.",
+                    category=UiPathEvaluationErrorCategory.SYSTEM,
+                )
+            parsed_response = json.loads(str(content))
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="FAILED_TO_PARSE_LLM_RESPONSE",
+                title="Failed to parse LLM response",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+        return LLMResponse(**parsed_response)
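
Two details of the rewritten mixin are easy to miss in the diff: the user prompt is built by literal replacement of the `{{ActualOutput}}` and `{{ExpectedOutput}}` placeholders, and the judge's 0-100 score is divided by 100, rounded, and clamped before being wrapped in a `NumericEvaluationResult`. A small self-contained sketch of both steps, using illustrative helper names that are not part of the SDK:

def fill_prompt(prompt: str, actual: object, expected: object) -> str:
    # Literal placeholder replacement, as in _create_evaluation_prompt.
    prompt = prompt.replace("{{ActualOutput}}", str(actual))
    return prompt.replace("{{ExpectedOutput}}", str(expected))

def normalize_score(raw_score: float) -> float:
    # Divide by 100, round to two decimals, clamp to [0, 1], as in evaluate().
    return max(0.0, min(1.0, round(raw_score / 100.0, 2)))

print(fill_prompt("Compare {{ActualOutput}} with {{ExpectedOutput}}.", {"a": 1}, {"a": 2}))
print(normalize_score(87))   # 0.87
print(normalize_score(130))  # clamped to 1.0

Clamping after rounding means an out-of-range judge score can never push the stored result outside the 0.0-1.0 range.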
uipath/eval/evaluators/llm_judge_output_evaluator.py

@@ -0,0 +1,112 @@
+"""LLM judge output evaluators for evaluating agent outputs."""
+
+from typing import TypeVar
+
+from pydantic import BaseModel
+
+from uipath.eval.models import EvaluatorType
+
+from ..models import AgentExecution, EvaluationResult
+from ..models.llm_judge_types import (
+    LLMJudgeOutputSchema,
+    LLMJudgePromptTemplates,
+    LLMJudgeStrictJSONSimilarityOutputSchema,
+)
+from .llm_as_judge_evaluator import (
+    BaseLLMJudgeEvaluatorConfig,
+    LLMJudgeMixin,
+)
+from .output_evaluator import (
+    OutputEvaluationCriteria,
+    OutputEvaluator,
+    OutputEvaluatorConfig,
+)
+
+
+class BaseLLMJudgeOutputCriteriaEvaluatorConfig(
+    OutputEvaluatorConfig[OutputEvaluationCriteria],
+    BaseLLMJudgeEvaluatorConfig[OutputEvaluationCriteria],
+):
+    """Base configuration for LLM judge output criteria evaluators."""
+
+    pass
+
+
+class LLMJudgeOutputEvaluatorConfig(BaseLLMJudgeOutputCriteriaEvaluatorConfig):
+    """Configuration for the LLM judge output evaluator."""
+
+    name: str = "LLMJudgeOutputEvaluator"
+    prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_DEFAULT_USER_PROMPT
+
+
+class LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig(LLMJudgeOutputEvaluatorConfig):
+    """Configuration for the LLM judge strict JSON similarity output evaluator."""
+
+    name: str = "LLMJudgeStrictJSONSimilarityOutputEvaluator"
+    prompt: str = (
+        LLMJudgePromptTemplates.LLM_JUDGE_STRICT_JSON_SIMILARITY_DEFAULT_USER_PROMPT
+    )
+
+
+OC = TypeVar("OC", bound=LLMJudgeOutputEvaluatorConfig)
+
+
+class BaseLLMOutputEvaluator(
+    OutputEvaluator[OutputEvaluationCriteria, OC, str],
+    LLMJudgeMixin[OutputEvaluationCriteria, OC],
+):
+    """Base class for LLM judge output evaluators that contains all shared functionality.
+
+    This class encapsulates the common evaluation logic for output-based LLM evaluators,
+    combining OutputEvaluator (for output extraction) with LLMJudgeMixin (for LLM functionality).
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_OUTPUT.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: OutputEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate using an LLM as a judge."""
+        # Explicitly delegate to LLMJudgeMixin's evaluate method to override BaseEvaluator
+        return await LLMJudgeMixin.evaluate(self, agent_execution, evaluation_criteria)
+
+
+class LLMJudgeOutputEvaluator(BaseLLMOutputEvaluator[LLMJudgeOutputEvaluatorConfig]):
+    """Evaluator that uses an LLM to judge the quality of agent output.
+
+    Inherits all functionality from BaseLLMOutputEvaluator but uses the standard
+    system prompt and output schema for general output evaluation.
+    """
+
+    system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_SYSTEM_PROMPT
+    output_schema: type[BaseModel] = LLMJudgeOutputSchema
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_OUTPUT_SEMANTIC_SIMILARITY.value
+
+
+class LLMJudgeStrictJSONSimilarityOutputEvaluator(
+    BaseLLMOutputEvaluator[LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig]
+):
+    """Evaluator that uses an LLM to judge the quality of agent output with strict JSON similarity.
+
+    Inherits all functionality from BaseLLMOutputEvaluator but uses a different system prompt
+    and output schema specific to strict JSON similarity evaluation.
+    """
+
+    system_prompt: str = (
+        LLMJudgePromptTemplates.LLM_JUDGE_STRICT_JSON_SIMILARITY_SYSTEM_PROMPT
+    )
+    output_schema: type[BaseModel] = LLMJudgeStrictJSONSimilarityOutputSchema
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY.value
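
`BaseLLMOutputEvaluator.evaluate` delegates with an explicit unbound call, `LLMJudgeMixin.evaluate(self, ...)`, rather than `super().evaluate(...)`; with multiple inheritance this pins the LLM-judge implementation regardless of where the mixin sits in the method resolution order. A toy illustration of that pattern using stand-in classes (not the package's types):

class OutputExtractor:
    def evaluate(self) -> str:
        return "deterministic comparison"

class LLMJudge:
    def evaluate(self) -> str:
        return "LLM-as-judge scoring"

class CombinedEvaluator(OutputExtractor, LLMJudge):
    def evaluate(self) -> str:
        # super().evaluate() would resolve to OutputExtractor (first in the MRO);
        # the explicit unbound call forces the judge behavior instead.
        return LLMJudge.evaluate(self)

assert CombinedEvaluator().evaluate() == "LLM-as-judge scoring"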
uipath/eval/evaluators/llm_judge_trajectory_evaluator.py

@@ -0,0 +1,142 @@
+"""LLM judge trajectory evaluator for evaluating agent execution trajectories."""
+
+from typing import Any, TypeVar
+
+from pydantic import BaseModel
+
+from .._helpers.evaluators_helpers import trace_to_str
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+)
+from ..models.llm_judge_types import (
+    LLMJudgePromptTemplates,
+    LLMJudgeTrajectoryOutputSchema,
+)
+from .base_evaluator import BaseEvaluationCriteria
+from .llm_as_judge_evaluator import (
+    BaseLLMJudgeEvaluatorConfig,
+    LLMJudgeMixin,
+)
+
+
+class TrajectoryEvaluationCriteria(BaseEvaluationCriteria):
+    """Evaluation criteria for trajectory-based evaluations."""
+
+    expected_agent_behavior: str
+
+
+class LLMJudgeTrajectoryEvaluatorConfig(
+    BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria]
+):
+    """Configuration for the llm judge trajectory evaluator."""
+
+    name: str = "LLMJudgeTrajectoryEvaluator"
+    prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT
+
+
+class LLMJudgeTrajectorySimulationEvaluatorConfig(
+    BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria]
+):
+    """Configuration for the llm judge simulation trajectory evaluator."""
+
+    name: str = "LLMJudgeTrajectorySimulationEvaluator"
+    prompt: str = (
+        LLMJudgePromptTemplates.LLM_JUDGE_SIMULATION_TRAJECTORY_DEFAULT_USER_PROMPT
+    )
+
+
+TC = TypeVar("TC", bound=BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria])
+
+
+class BaseLLMTrajectoryEvaluator(LLMJudgeMixin[TrajectoryEvaluationCriteria, TC]):
+    """Base class for LLM trajectory evaluators that contains all shared functionality.
+
+    This class encapsulates the common evaluation logic for trajectory-based LLM evaluators,
+    including output extraction, prompt formatting, and evaluation criteria handling.
+    """
+
+    output_schema: type[BaseModel] = LLMJudgeTrajectoryOutputSchema
+    actual_output_placeholder: str = "{{AgentRunHistory}}"
+    expected_output_placeholder: str = "{{ExpectedAgentBehavior}}"
+    user_input_placeholder: str = "{{UserOrSyntheticInput}}"
+    simulation_instructions_placeholder: str = "{{SimulationInstructions}}"
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_TRAJECTORY.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: TrajectoryEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate using trajectory analysis."""
+        return await super().evaluate(agent_execution, evaluation_criteria)
+
+    def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
+        """Get the actual output from the agent execution."""
+        return trace_to_str(agent_execution.agent_trace)
+
+    def _get_expected_output(
+        self, evaluation_criteria: TrajectoryEvaluationCriteria
+    ) -> Any:
+        """Get the expected agent behavior from the evaluation criteria."""
+        return evaluation_criteria.expected_agent_behavior
+
+    def _create_evaluation_prompt(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: TrajectoryEvaluationCriteria,
+    ) -> str:
+        """Create the evaluation prompt for the LLM."""
+        formatted_prompt = super()._create_evaluation_prompt(
+            agent_execution, evaluation_criteria
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.user_input_placeholder,
+            str(agent_execution.agent_input),
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.simulation_instructions_placeholder,
+            agent_execution.simulation_instructions,
+        )
+        return formatted_prompt
+
+
+class LLMJudgeTrajectoryEvaluator(
+    BaseLLMTrajectoryEvaluator[LLMJudgeTrajectoryEvaluatorConfig]
+):
+    """Evaluator that uses an LLM to judge the quality of agent trajectory.
+
+    Inherits all functionality from BaseLLMTrajectoryEvaluator but uses the standard
+    system prompt and configuration for general trajectory evaluation.
+    """
+
+    system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_TRAJECTORY_SYSTEM_PROMPT
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY.value
+
+
+class LLMJudgeTrajectorySimulationEvaluator(
+    BaseLLMTrajectoryEvaluator[LLMJudgeTrajectorySimulationEvaluatorConfig]
+):
+    """Evaluator that uses an LLM to judge the quality of agent trajectory for simulations.
+
+    Inherits all functionality from BaseLLMTrajectoryEvaluator but uses a different system prompt
+    and configuration specific to simulation evaluation.
+    """
+
+    system_prompt: str = (
+        LLMJudgePromptTemplates.LLM_JUDGE_SIMULATION_TRAJECTORY_SYSTEM_PROMPT
+    )
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION.value
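
Concrete judges only have to tell the mixin what counts as "actual" and "expected" output; the trajectory variants answer with the serialized trace (`trace_to_str(agent_execution.agent_trace)`) and the criteria's `expected_agent_behavior`, and then layer the `{{UserOrSyntheticInput}}` and `{{SimulationInstructions}}` substitutions on top. A compact, generic sketch of that hook pattern with stand-in types (not the SDK's classes):

from abc import ABC, abstractmethod

class JudgeBase(ABC):
    template = "Actual: {{ActualOutput}}\nExpected: {{ExpectedOutput}}"

    @abstractmethod
    def _get_actual_output(self, execution: dict) -> str: ...

    @abstractmethod
    def _get_expected_output(self, criteria: dict) -> str: ...

    def build_prompt(self, execution: dict, criteria: dict) -> str:
        # The base class owns prompt assembly; subclasses only supply the two values.
        prompt = self.template.replace("{{ActualOutput}}", self._get_actual_output(execution))
        return prompt.replace("{{ExpectedOutput}}", self._get_expected_output(criteria))

class TrajectoryJudge(JudgeBase):
    def _get_actual_output(self, execution: dict) -> str:
        # Stand-in for trace_to_str(agent_execution.agent_trace).
        return " -> ".join(execution["trace"])

    def _get_expected_output(self, criteria: dict) -> str:
        return criteria["expected_agent_behavior"]

print(TrajectoryJudge().build_prompt(
    {"trace": ["plan", "call_search_tool", "answer"]},
    {"expected_agent_behavior": "Call the search tool before answering."},
))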