uipath 2.1.107__py3-none-any.whl → 2.1.109__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic.

Files changed (72)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_runtime/_contracts.py +25 -5
  19. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  20. uipath/_cli/_utils/_eval_set.py +30 -9
  21. uipath/_cli/_utils/_resources.py +21 -0
  22. uipath/_cli/_utils/_studio_project.py +18 -0
  23. uipath/_cli/cli_add.py +114 -0
  24. uipath/_cli/cli_eval.py +5 -1
  25. uipath/_cli/cli_pull.py +11 -26
  26. uipath/_cli/cli_push.py +2 -0
  27. uipath/_cli/cli_register.py +45 -0
  28. uipath/_events/_events.py +6 -5
  29. uipath/_resources/SDK_REFERENCE.md +0 -97
  30. uipath/_uipath.py +10 -37
  31. uipath/_utils/constants.py +4 -0
  32. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  33. uipath/eval/_helpers/helpers.py +30 -2
  34. uipath/eval/evaluators/__init__.py +60 -5
  35. uipath/eval/evaluators/base_evaluator.py +546 -44
  36. uipath/eval/evaluators/contains_evaluator.py +80 -0
  37. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  38. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  39. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  40. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  41. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  42. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  43. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  44. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  45. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  46. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  47. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  48. uipath/eval/evaluators/output_evaluator.py +117 -0
  49. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  50. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  51. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  52. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  53. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  54. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  55. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  56. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  57. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  58. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  59. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  60. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  61. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  62. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  63. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  64. uipath/eval/evaluators_types/generate_types.py +31 -0
  65. uipath/eval/models/__init__.py +16 -1
  66. uipath/eval/models/llm_judge_types.py +196 -0
  67. uipath/eval/models/models.py +109 -7
  68. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
  69. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
  70. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
  71. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
  72. {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py}

```diff
@@ -16,11 +16,11 @@ from ..models.models import (
     NumericEvaluationResult,
     TrajectoryEvaluationTrace,
 )
-from .base_evaluator import BaseEvaluator
+from .legacy_base_evaluator import LegacyBaseEvaluator
 
 
-class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]):
-    """Evaluator that analyzes the trajectory/path taken to reach outputs."""
+class LegacyTrajectoryEvaluator(LegacyBaseEvaluator[dict[str, Any]]):
+    """Legacy evaluator that analyzes the trajectory/path taken to reach outputs."""
 
     prompt: str
     model: str
@@ -38,7 +38,7 @@ class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]):
             )
         return v
 
-    def model_post_init(self, __context):
+    def model_post_init(self, __context: Any):
         """Initialize the LLM service after model creation."""
         super().model_post_init(__context)
         self._initialize_llm()
@@ -76,7 +76,6 @@ class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]):
             expected_agent_behavior=agent_execution.expected_agent_behavior,
             agent_run_history=agent_execution.agent_trace,
         )
-
         llm_response = await self._get_llm_response(evaluation_prompt)
 
         return NumericEvaluationResult(
@@ -160,4 +159,4 @@ class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]):
         }
 
         response = await self.llm.chat_completions(**request_data)
-        return LLMResponse(**json.loads(response.choices[-1].message.content))
+        return LLMResponse(**json.loads(response.choices[-1].message.content or "{}"))
```
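The substantive change in the last hunk is the `or "{}"` fallback: if the judge model returns an empty message, `json.loads` is no longer handed `None` (which raises `TypeError`) but an empty JSON object instead. A minimal standalone sketch of that pattern; the helper name below is illustrative, not part of the package:

```python
import json


def parse_llm_content(content: str | None) -> dict:
    """Parse the judge's JSON reply, tolerating a missing message body.

    json.loads(None) would raise TypeError; `content or "{}"` falls back to
    an empty object for None or empty-string content.
    """
    return json.loads(content or "{}")


assert parse_llm_content('{"score": 87, "justification": "Close match."}') == {
    "score": 87,
    "justification": "Close match.",
}
assert parse_llm_content(None) == {}
assert parse_llm_content("") == {}
```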
uipath/eval/evaluators/llm_as_judge_evaluator.py

```diff
@@ -1,137 +1,202 @@
 """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
 
 import json
-from typing import Any, Optional
+from abc import abstractmethod
+from collections.abc import Callable
+from typing import Any, TypeVar
+
+from pydantic import BaseModel, Field, model_validator
+
+from .._helpers.evaluators_helpers import COMMUNITY_agents_SUFFIX
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    LLMResponse,
+    NumericEvaluationResult,
+)
+from ..models.llm_judge_types import (
+    LLMJudgeOutputSchema,
+    LLMJudgePromptTemplates,
+)
+from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory
+from .base_evaluator import (
+    BaseEvaluationCriteria,
+    BaseEvaluator,
+    BaseEvaluatorConfig,
+)
+
+T = TypeVar("T", bound=BaseEvaluationCriteria)
+
+
+class BaseLLMJudgeEvaluatorConfig(BaseEvaluatorConfig[T]):
+    """Base config for all LLM evaluators.
+
+    Generic over T (evaluation criteria type) to ensure type safety between
+    the config's default_evaluation_criteria and the evaluator's expected criteria type.
+    """
 
-from pydantic import field_validator
+    prompt: str
+    model: str = ""
+    temperature: float = 0.0
+    max_tokens: int | None = None
 
-from uipath.eval.models import NumericEvaluationResult
 
-from ..._services import UiPathLlmChatService
-from ..._utils.constants import COMMUNITY_agents_SUFFIX
-from ..models.models import AgentExecution, EvaluationResult, LLMResponse
-from .base_evaluator import BaseEvaluator
+C = TypeVar("C", bound=BaseLLMJudgeEvaluatorConfig[Any])
 
 
-class LlmAsAJudgeEvaluator(BaseEvaluator[dict[str, Any]]):
-    """Evaluator that uses an LLM to judge the quality of agent output."""
+class LLMJudgeMixin(BaseEvaluator[T, C, str]):
+    """Mixin that provides common LLM judge functionality."""
 
-    prompt: str
-    model: str
+    system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_SYSTEM_PROMPT
+    output_schema: type[BaseModel] = LLMJudgeOutputSchema
     actual_output_placeholder: str = "{{ActualOutput}}"
     expected_output_placeholder: str = "{{ExpectedOutput}}"
-    llm: Optional[UiPathLlmChatService] = None
+    llm_service: Callable[..., Any] | None = Field(
+        default=None, exclude=True, description="The LLM service for evaluation"
+    )
 
-    @field_validator("prompt")
-    @classmethod
-    def validate_prompt_placeholders(cls, v: str) -> str:
+    @model_validator(mode="after")
+    def validate_prompt_placeholders(self) -> "LLMJudgeMixin[T, C]":
         """Validate that prompt contains required placeholders."""
-        if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v:
-            raise ValueError(
-                "Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders"
+        if (
+            self.actual_output_placeholder not in self.evaluator_config.prompt
+            or self.expected_output_placeholder not in self.evaluator_config.prompt
+        ):
+            raise UiPathEvaluationError(
+                code="INVALID_PROMPT_PLACEHOLDERS",
+                title="Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders",
+                detail="Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders",
+                category=UiPathEvaluationErrorCategory.USER,
             )
-        return v
+        return self
 
-    def model_post_init(self, __context):
-        """Initialize the LLM service after model creation."""
+    def model_post_init(self, __context: Any) -> None:
+        """Initialize the LLM service if not provided."""
         super().model_post_init(__context)
-        self._initialize_llm()
+        if self.llm_service is None:
+            self.llm_service = self._get_llm_service()
 
-    def _initialize_llm(self):
-        """Initialize the LLM used for evaluation."""
+    def _get_llm_service(self):
+        """Get the LLM service from the UiPath instance."""
         from uipath import UiPath
 
-        uipath = UiPath()
-        self.llm = uipath.llm
+        try:
+            uipath = UiPath()
+            return uipath.llm.chat_completions
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="FAILED_TO_GET_LLM_SERVICE",
+                title="Failed to get LLM service from the SDK and no otherLLM service provided",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+
+    @abstractmethod
+    def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
+        """Get the actual output from the agent execution. Must be implemented by concrete evaluator classes."""
+        pass
+
+    @abstractmethod
+    def _get_expected_output(self, evaluation_criteria: T) -> Any:
+        """Get the expected output from the evaluation criteria. Must be implemented by concrete evaluator classes."""
+        pass
 
     async def evaluate(
         self,
         agent_execution: AgentExecution,
-        evaluation_criteria: dict[str, Any],
+        evaluation_criteria: T,
     ) -> EvaluationResult:
-        """Evaluate using an LLM as a judge.
-
-        Sends the formatted prompt to the configured LLM and expects a JSON response
-        with a numerical score (0-100) and justification.
-
-        agent_execution: The execution details containing:
-            - agent_input: The input received by the agent
-            - actual_output: The actual output from the agent
-            - spans: The execution spans to use for the evaluation
-        evaluation_criteria: The criteria to evaluate
-
-        Returns:
-            EvaluationResult: Numerical score with LLM justification as details
-        """
-        # Create the evaluation prompt
+        """Evaluate using an LLM as a judge."""
         evaluation_prompt = self._create_evaluation_prompt(
-            expected_output=evaluation_criteria,
-            actual_output=agent_execution.agent_output,
+            agent_execution=agent_execution,
+            evaluation_criteria=evaluation_criteria,
         )
 
         llm_response = await self._get_llm_response(evaluation_prompt)
+        validated_justification = self.validate_justification(
+            llm_response.justification
+        )
 
         return NumericEvaluationResult(
-            score=llm_response.score,
-            details=llm_response.justification,
+            score=max(0.0, min(1.0, round(llm_response.score / 100.0, 2))),
+            details=validated_justification,
         )
 
     def _create_evaluation_prompt(
-        self, expected_output: Any, actual_output: Any
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: T,
     ) -> str:
         """Create the evaluation prompt for the LLM."""
-        formatted_prompt = self.prompt.replace(
+        formatted_prompt = self.evaluator_config.prompt.replace(
             self.actual_output_placeholder,
-            str(actual_output),
+            str(self._get_actual_output(agent_execution)),
         )
         formatted_prompt = formatted_prompt.replace(
             self.expected_output_placeholder,
-            str(expected_output),
+            str(self._get_expected_output(evaluation_criteria)),
         )
 
         return formatted_prompt
 
     async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
-        """Get response from the LLM.
-
-        Args:
-            evaluation_prompt: The formatted prompt to send to the LLM
-
-        Returns:
-            LLMResponse with score and justification
-        """
+        """Get response from the LLM."""
         # remove community-agents suffix from llm model name
-        model = self.model
+        model = self.evaluator_config.model
         if model.endswith(COMMUNITY_agents_SUFFIX):
            model = model.replace(COMMUNITY_agents_SUFFIX, "")
 
         # Prepare the request
         request_data = {
             "model": model,
-            "messages": [{"role": "user", "content": evaluation_prompt}],
+            "messages": [
+                {"role": "system", "content": self.system_prompt},
+                {"role": "user", "content": evaluation_prompt},
+            ],
             "response_format": {
                 "type": "json_schema",
                 "json_schema": {
                     "name": "evaluation_response",
-                    "schema": {
-                        "type": "object",
-                        "properties": {
-                            "score": {
-                                "type": "number",
-                                "minimum": 0,
-                                "maximum": 100,
-                                "description": "Score between 0 and 100",
-                            },
-                            "justification": {
-                                "type": "string",
-                                "description": "Explanation for the score",
-                            },
-                        },
-                        "required": ["score", "justification"],
-                    },
+                    "schema": self.output_schema.model_json_schema(),
                 },
             },
+            "max_tokens": self.evaluator_config.max_tokens,
+            "temperature": self.evaluator_config.temperature,
         }
 
-        response = await self.llm.chat_completions(**request_data)  # type: ignore
-        return LLMResponse(**json.loads(response.choices[-1].message.content))
+        if self.llm_service is None:
+            raise UiPathEvaluationError(
+                code="LLM_SERVICE_NOT_INITIALIZED",
+                title="LLM service not initialized",
+                detail="LLM service not initialized",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        try:
+            response = await self.llm_service(**request_data)
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="FAILED_TO_GET_LLM_RESPONSE",
+                title="Failed to get LLM response",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+
+        try:
+            content = response.choices[-1].message.content
+            if content is None:
+                raise UiPathEvaluationError(
+                    code="EMPTY_LLM_RESPONSE",
+                    title="Empty LLM response",
+                    detail="The LLM response message content was None.",
+                    category=UiPathEvaluationErrorCategory.SYSTEM,
+                )
+            parsed_response = json.loads(str(content))
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="FAILED_TO_PARSE_LLM_RESPONSE",
+                title="Failed to parse LLM response",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+        return LLMResponse(**parsed_response)
```
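Two behavioral changes above are easy to miss in the noise: the `response_format` schema is now generated from a Pydantic model via `model_json_schema()` instead of being written inline, and the raw 0-100 judge score is clamped and rescaled to 0.0-1.0 before it lands in `NumericEvaluationResult`. A hedged sketch of both, using a stand-in model that mirrors the inline schema removed in this hunk (the package's real `LLMJudgeOutputSchema` may differ):

```python
from pydantic import BaseModel, Field


class EvaluationResponse(BaseModel):
    """Stand-in for the judge's reply shape: a 0-100 score plus a justification."""

    score: float = Field(ge=0, le=100, description="Score between 0 and 100")
    justification: str = Field(description="Explanation for the score")


# Roughly what gets plugged into request_data["response_format"]["json_schema"]["schema"].
schema = EvaluationResponse.model_json_schema()
assert set(schema["required"]) == {"score", "justification"}


def normalize_score(raw_score: float) -> float:
    """Clamp and rescale a 0-100 judge score to 0.0-1.0, as evaluate() now does."""
    return max(0.0, min(1.0, round(raw_score / 100.0, 2)))


assert normalize_score(87) == 0.87
assert normalize_score(250) == 1.0  # clamped high
assert normalize_score(-5) == 0.0   # clamped low
```

Deriving the schema from the same Pydantic model that later validates the reply keeps the contract in one place rather than two hand-written copies.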
uipath/eval/evaluators/llm_judge_output_evaluator.py

```diff
@@ -0,0 +1,112 @@
+"""LLM judge output evaluators for evaluating agent outputs."""
+
+from typing import TypeVar
+
+from pydantic import BaseModel
+
+from uipath.eval.models import EvaluatorType
+
+from ..models import AgentExecution, EvaluationResult
+from ..models.llm_judge_types import (
+    LLMJudgeOutputSchema,
+    LLMJudgePromptTemplates,
+    LLMJudgeStrictJSONSimilarityOutputSchema,
+)
+from .llm_as_judge_evaluator import (
+    BaseLLMJudgeEvaluatorConfig,
+    LLMJudgeMixin,
+)
+from .output_evaluator import (
+    OutputEvaluationCriteria,
+    OutputEvaluator,
+    OutputEvaluatorConfig,
+)
+
+
+class BaseLLMJudgeOutputCriteriaEvaluatorConfig(
+    OutputEvaluatorConfig[OutputEvaluationCriteria],
+    BaseLLMJudgeEvaluatorConfig[OutputEvaluationCriteria],
+):
+    """Base configuration for LLM judge output criteria evaluators."""
+
+    pass
+
+
+class LLMJudgeOutputEvaluatorConfig(BaseLLMJudgeOutputCriteriaEvaluatorConfig):
+    """Configuration for the LLM judge output evaluator."""
+
+    name: str = "LLMJudgeOutputEvaluator"
+    prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_DEFAULT_USER_PROMPT
+
+
+class LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig(LLMJudgeOutputEvaluatorConfig):
+    """Configuration for the LLM judge strict JSON similarity output evaluator."""
+
+    name: str = "LLMJudgeStrictJSONSimilarityOutputEvaluator"
+    prompt: str = (
+        LLMJudgePromptTemplates.LLM_JUDGE_STRICT_JSON_SIMILARITY_DEFAULT_USER_PROMPT
+    )
+
+
+OC = TypeVar("OC", bound=LLMJudgeOutputEvaluatorConfig)
+
+
+class BaseLLMOutputEvaluator(
+    OutputEvaluator[OutputEvaluationCriteria, OC, str],
+    LLMJudgeMixin[OutputEvaluationCriteria, OC],
+):
+    """Base class for LLM judge output evaluators that contains all shared functionality.
+
+    This class encapsulates the common evaluation logic for output-based LLM evaluators,
+    combining OutputEvaluator (for output extraction) with LLMJudgeMixin (for LLM functionality).
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_OUTPUT.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: OutputEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate using an LLM as a judge."""
+        # Explicitly delegate to LLMJudgeMixin's evaluate method to override BaseEvaluator
+        return await LLMJudgeMixin.evaluate(self, agent_execution, evaluation_criteria)
+
+
+class LLMJudgeOutputEvaluator(BaseLLMOutputEvaluator[LLMJudgeOutputEvaluatorConfig]):
+    """Evaluator that uses an LLM to judge the quality of agent output.
+
+    Inherits all functionality from BaseLLMOutputEvaluator but uses the standard
+    system prompt and output schema for general output evaluation.
+    """
+
+    system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_SYSTEM_PROMPT
+    output_schema: type[BaseModel] = LLMJudgeOutputSchema
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_OUTPUT_SEMANTIC_SIMILARITY.value
+
+
+class LLMJudgeStrictJSONSimilarityOutputEvaluator(
+    BaseLLMOutputEvaluator[LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig]
+):
+    """Evaluator that uses an LLM to judge the quality of agent output with strict JSON similarity.
+
+    Inherits all functionality from BaseLLMOutputEvaluator but uses a different system prompt
+    and output schema specific to strict JSON similarity evaluation.
+    """
+
+    system_prompt: str = (
+        LLMJudgePromptTemplates.LLM_JUDGE_STRICT_JSON_SIMILARITY_SYSTEM_PROMPT
+    )
+    output_schema: type[BaseModel] = LLMJudgeStrictJSONSimilarityOutputSchema
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY.value
```
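Each new evaluator class advertises a stable identifier through the `get_evaluator_id()` classmethod, which returns an `EvaluatorType` value. A small sketch of how a caller could build a lookup table from those ids; the import path follows the new module added in this diff and is an assumption (the classes may also be re-exported from `uipath.eval.evaluators`):

```python
# Hedged sketch: map evaluator-type ids (EvaluatorType.*.value) to the classes
# that implement them, without instantiating anything.
from uipath.eval.evaluators.llm_judge_output_evaluator import (
    BaseLLMOutputEvaluator,
    LLMJudgeOutputEvaluator,
    LLMJudgeStrictJSONSimilarityOutputEvaluator,
)

OUTPUT_JUDGES: dict[str, type[BaseLLMOutputEvaluator]] = {
    cls.get_evaluator_id(): cls
    for cls in (
        LLMJudgeOutputEvaluator,
        LLMJudgeStrictJSONSimilarityOutputEvaluator,
    )
}

print(sorted(OUTPUT_JUDGES))  # the two registered EvaluatorType ids
```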
uipath/eval/evaluators/llm_judge_trajectory_evaluator.py

```diff
@@ -0,0 +1,142 @@
+"""LLM judge trajectory evaluator for evaluating agent execution trajectories."""
+
+from typing import Any, TypeVar
+
+from pydantic import BaseModel
+
+from .._helpers.evaluators_helpers import trace_to_str
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+)
+from ..models.llm_judge_types import (
+    LLMJudgePromptTemplates,
+    LLMJudgeTrajectoryOutputSchema,
+)
+from .base_evaluator import BaseEvaluationCriteria
+from .llm_as_judge_evaluator import (
+    BaseLLMJudgeEvaluatorConfig,
+    LLMJudgeMixin,
+)
+
+
+class TrajectoryEvaluationCriteria(BaseEvaluationCriteria):
+    """Evaluation criteria for trajectory-based evaluations."""
+
+    expected_agent_behavior: str
+
+
+class LLMJudgeTrajectoryEvaluatorConfig(
+    BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria]
+):
+    """Configuration for the llm judge trajectory evaluator."""
+
+    name: str = "LLMJudgeTrajectoryEvaluator"
+    prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT
+
+
+class LLMJudgeTrajectorySimulationEvaluatorConfig(
+    BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria]
+):
+    """Configuration for the llm judge simulation trajectory evaluator."""
+
+    name: str = "LLMJudgeTrajectorySimulationEvaluator"
+    prompt: str = (
+        LLMJudgePromptTemplates.LLM_JUDGE_SIMULATION_TRAJECTORY_DEFAULT_USER_PROMPT
+    )
+
+
+TC = TypeVar("TC", bound=BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria])
+
+
+class BaseLLMTrajectoryEvaluator(LLMJudgeMixin[TrajectoryEvaluationCriteria, TC]):
+    """Base class for LLM trajectory evaluators that contains all shared functionality.
+
+    This class encapsulates the common evaluation logic for trajectory-based LLM evaluators,
+    including output extraction, prompt formatting, and evaluation criteria handling.
+    """
+
+    output_schema: type[BaseModel] = LLMJudgeTrajectoryOutputSchema
+    actual_output_placeholder: str = "{{AgentRunHistory}}"
+    expected_output_placeholder: str = "{{ExpectedAgentBehavior}}"
+    user_input_placeholder: str = "{{UserOrSyntheticInput}}"
+    simulation_instructions_placeholder: str = "{{SimulationInstructions}}"
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_TRAJECTORY.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: TrajectoryEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate using trajectory analysis."""
+        return await super().evaluate(agent_execution, evaluation_criteria)
+
+    def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
+        """Get the actual output from the agent execution."""
+        return trace_to_str(agent_execution.agent_trace)
+
+    def _get_expected_output(
+        self, evaluation_criteria: TrajectoryEvaluationCriteria
+    ) -> Any:
+        """Get the expected agent behavior from the evaluation criteria."""
+        return evaluation_criteria.expected_agent_behavior
+
+    def _create_evaluation_prompt(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: TrajectoryEvaluationCriteria,
+    ) -> str:
+        """Create the evaluation prompt for the LLM."""
+        formatted_prompt = super()._create_evaluation_prompt(
+            agent_execution, evaluation_criteria
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.user_input_placeholder,
+            str(agent_execution.agent_input),
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.simulation_instructions_placeholder,
+            agent_execution.simulation_instructions,
+        )
+        return formatted_prompt
+
+
+class LLMJudgeTrajectoryEvaluator(
+    BaseLLMTrajectoryEvaluator[LLMJudgeTrajectoryEvaluatorConfig]
+):
+    """Evaluator that uses an LLM to judge the quality of agent trajectory.
+
+    Inherits all functionality from BaseLLMTrajectoryEvaluator but uses the standard
+    system prompt and configuration for general trajectory evaluation.
+    """
+
+    system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_TRAJECTORY_SYSTEM_PROMPT
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY.value
+
+
+class LLMJudgeTrajectorySimulationEvaluator(
+    BaseLLMTrajectoryEvaluator[LLMJudgeTrajectorySimulationEvaluatorConfig]
+):
+    """Evaluator that uses an LLM to judge the quality of agent trajectory for simulations.
+
+    Inherits all functionality from BaseLLMTrajectoryEvaluator but uses a different system prompt
+    and configuration specific to simulation evaluation.
+    """
+
+    system_prompt: str = (
+        LLMJudgePromptTemplates.LLM_JUDGE_SIMULATION_TRAJECTORY_SYSTEM_PROMPT
+    )
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION.value
```
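The trajectory evaluators extend the base prompt formatting with two extra placeholders, `{{UserOrSyntheticInput}}` and `{{SimulationInstructions}}`, on top of `{{AgentRunHistory}}` and `{{ExpectedAgentBehavior}}`. A standalone sketch of that substitution; the template and helper below are illustrative only, not the package's real default prompts:

```python
# Hedged sketch of the placeholder substitution performed by
# BaseLLMTrajectoryEvaluator._create_evaluation_prompt in this diff.

def format_trajectory_prompt(
    template: str,
    agent_run_history: str,
    expected_agent_behavior: str,
    user_input: str,
    simulation_instructions: str,
) -> str:
    """Fill the four placeholders used by the trajectory evaluators."""
    prompt = template.replace("{{AgentRunHistory}}", agent_run_history)
    prompt = prompt.replace("{{ExpectedAgentBehavior}}", expected_agent_behavior)
    prompt = prompt.replace("{{UserOrSyntheticInput}}", user_input)
    prompt = prompt.replace("{{SimulationInstructions}}", simulation_instructions)
    return prompt


# Illustrative template, not LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT.
example_template = (
    "User input:\n{{UserOrSyntheticInput}}\n\n"
    "Simulation instructions:\n{{SimulationInstructions}}\n\n"
    "Expected behavior:\n{{ExpectedAgentBehavior}}\n\n"
    "Actual run history:\n{{AgentRunHistory}}"
)

print(
    format_trajectory_prompt(
        example_template,
        agent_run_history="tool_call: search(query='invoice 123')",
        expected_agent_behavior="The agent should look up the invoice before replying.",
        user_input="Find invoice 123",
        simulation_instructions="Simulate a user asking about a missing invoice.",
    )
)
```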