uipath 2.1.108__py3-none-any.whl → 2.1.109__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of uipath might be problematic.
Files changed (69)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  19. uipath/_cli/_utils/_eval_set.py +30 -9
  20. uipath/_cli/_utils/_resources.py +21 -0
  21. uipath/_cli/_utils/_studio_project.py +18 -0
  22. uipath/_cli/cli_add.py +114 -0
  23. uipath/_cli/cli_eval.py +5 -1
  24. uipath/_cli/cli_pull.py +11 -26
  25. uipath/_cli/cli_push.py +2 -0
  26. uipath/_cli/cli_register.py +45 -0
  27. uipath/_events/_events.py +6 -5
  28. uipath/_utils/constants.py +4 -0
  29. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  30. uipath/eval/_helpers/helpers.py +30 -2
  31. uipath/eval/evaluators/__init__.py +60 -5
  32. uipath/eval/evaluators/base_evaluator.py +546 -44
  33. uipath/eval/evaluators/contains_evaluator.py +80 -0
  34. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  35. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  36. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  37. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  38. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  39. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  40. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  41. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  42. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  43. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  44. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  45. uipath/eval/evaluators/output_evaluator.py +117 -0
  46. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  47. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  48. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  49. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  50. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  51. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  52. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  53. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  54. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  55. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  56. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  57. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  58. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  59. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  60. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  61. uipath/eval/evaluators_types/generate_types.py +31 -0
  62. uipath/eval/models/__init__.py +16 -1
  63. uipath/eval/models/llm_judge_types.py +196 -0
  64. uipath/eval/models/models.py +109 -7
  65. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
  66. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/RECORD +69 -37
  67. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
  68. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
  69. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/eval/evaluators/base_evaluator.py
@@ -1,61 +1,93 @@
 """Base evaluator abstract class for agent evaluation."""

-import functools
-import time
+import json
+import warnings
 from abc import ABC, abstractmethod
-from typing import Generic, TypeVar
+from typing import Any, Generic, TypeVar, Union, cast, get_args

-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic.alias_generators import to_camel

-from uipath.eval.models import EvaluationResult
-from uipath.eval.models.models import (
-    AgentExecution,
-    ErrorEvaluationResult,
-    EvaluatorCategory,
-    EvaluatorType,
-)
+from .._helpers.helpers import track_evaluation_metrics
+from ..models import AgentExecution, EvaluationResult
+from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory


-def track_evaluation_metrics(func):
-    """Decorator to track evaluation metrics and handle errors gracefully."""
+class BaseEvaluationCriteria(BaseModel):
+    """Base class for all evaluation criteria."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    pass
+
+
+# Type variable for evaluation criteria, used by both Config and Evaluator
+T = TypeVar("T", bound=BaseEvaluationCriteria)
+
+
+class BaseEvaluatorConfig(BaseModel, Generic[T]):
+    """Base class for all evaluator configurations.
+
+    Generic over T (evaluation criteria type) to ensure type safety between
+    the config's default_evaluation_criteria and the evaluator's expected criteria type.
+    """
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    name: str
+    default_evaluation_criteria: T | None = None
+
+
+class BaseEvaluatorJustification(BaseModel):
+    """Base class for all evaluator justifications."""
+
+    pass
+
+
+# Additional type variables for Config and Justification
+# Note: C must be BaseEvaluatorConfig[T] to ensure type consistency
+C = TypeVar("C", bound=BaseEvaluatorConfig[Any])
+J = TypeVar("J", bound=Union[str, None, BaseEvaluatorJustification])

-    @functools.wraps(func)
-    async def wrapper(*args, **kwargs) -> EvaluationResult:
-        start_time = time.time()
-        try:
-            result = await func(*args, **kwargs)
-        except Exception as e:
-            result = ErrorEvaluationResult(
-                details="Exception thrown by evaluator: {}".format(e),
-                evaluation_time=time.time() - start_time,
-            )
-        end_time = time.time()
-        execution_time = end_time - start_time

-        result.evaluation_time = execution_time
-        return result
+class BaseEvaluator(BaseModel, Generic[T, C, J], ABC):
+    """Abstract base class for all evaluators.

-    return wrapper
+    Generic Parameters:
+        T: The evaluation criteria type (bound to BaseEvaluationCriteria)
+        C: The evaluator config type (bound to BaseEvaluatorConfig[T])
+        J: The justification type (str, None, or BaseEvaluatorJustification subclass)

+    Design Rationale:
+    T is explicitly specified even though C = BaseEvaluatorConfig[T] already encodes it.
+    This redundancy is intentional and provides:

-T = TypeVar("T")
+    1. **Type Checker Support**: Static type checkers can infer the exact criteria type
+       for the evaluate() method signature without runtime introspection

+    2. **Clear API**: The signature BaseEvaluator[MyCriteria, MyConfig[MyCriteria], str]
+       makes it immediately obvious what criteria type is expected

-class BaseEvaluator(BaseModel, Generic[T], ABC):
-    """Abstract base class for all evaluators."""
+    3. **IDE Support**: Autocomplete and type hints work perfectly for method parameters
+
+    Runtime validation ensures T and C's generic parameter are consistent.
+    """

     model_config = ConfigDict(arbitrary_types_allowed=True)

     id: str
-    name: str
-    description: str
-    target_output_key: str = "*"
-    created_at: str
-    updated_at: str
-    category: EvaluatorCategory
-    evaluator_type: EvaluatorType
-
-    def __init_subclass__(cls, **kwargs):
+    config: dict[str, Any] = Field(description="The config dictionary")
+    config_type: type[C] = Field(description="The config type class")
+    evaluation_criteria_type: type[T] = Field(
+        description="The type used for evaluation criteria validation and creation"
+    )
+    justification_type: type[J] = Field(
+        description="The type used for justification validation and creation"
+    )
+    evaluator_config: C = Field(
+        exclude=True, description="The validated config object instance"
+    )
+
+    def __init_subclass__(cls, **kwargs: Any):
         """Hook for subclass creation - automatically applies evaluation metrics tracking."""
         super().__init_subclass__(**kwargs)

@@ -65,10 +97,479 @@ class BaseEvaluator(BaseModel, Generic[T], ABC):
             cls.evaluate = track_evaluation_metrics(cls.evaluate)  # type: ignore[method-assign]
             cls.evaluate._has_metrics_decorator = True  # type: ignore[attr-defined]

-    def model_post_init(self, __context):
-        """Post-initialization hook for Pydantic models."""
+    @property
+    def name(self) -> str:
+        """Evaluator's name."""
+        return self.evaluator_config.name
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_model(cls, values: Any) -> Any:
+        """Pre-initialization model validator for Pydantic models.
+
+        This validator extracts the Generic type parameters and validates their consistency.
+
+        Args:
+            values: The raw input values before validation
+
+        Returns:
+            The validated/transformed values with types set
+
+        Raises:
+            ValueError: If types cannot be determined or are inconsistent
+        """
+        if isinstance(values, dict):
+            # Always extract and set evaluation_criteria_type
+            criteria_type = cls._extract_evaluation_criteria_type()
+            values["evaluation_criteria_type"] = criteria_type
+
+            # Always extract and set config_type
+            config_type = cls._extract_config_type()
+            values["config_type"] = config_type
+
+            # Always extract and set justification_type
+            justification_type = cls._extract_justification_type()
+            values["justification_type"] = justification_type
+
+            # Validate consistency: config's generic parameter should match criteria_type
+            cls._validate_type_consistency(config_type, criteria_type)
+
+            # Validate and create the config object if config dict is provided
+            try:
+                validated_config = config_type.model_validate(values.get("config", {}))
+                values["evaluator_config"] = validated_config
+            except Exception as e:
+                raise UiPathEvaluationError(
+                    code="FAILED_TO_VALIDATE_EVALUATOR_CONFIG",
+                    title=f"Failed to validate evaluator config for {cls.__name__}",
+                    detail=f"Error: {e}",
+                    category=UiPathEvaluationErrorCategory.SYSTEM,
+                ) from e
+
+        return values
+
+    @classmethod
+    def _validate_type_consistency(
+        cls,
+        config_type: type[BaseEvaluatorConfig[Any]],
+        criteria_type: type[BaseEvaluationCriteria],
+    ) -> None:
+        """Validate that the config's generic parameter matches the evaluator's criteria type.
+
+        Extracts the criteria type from the config's default_evaluation_criteria field
+        annotation and validates it matches the evaluator's expected criteria type.
+
+        Args:
+            config_type: The config type to validate
+            criteria_type: The expected evaluation criteria type
+
+        Raises:
+            ValueError: If the types are inconsistent
+        """
+        # Skip validation for base classes
+        if config_type.__name__ in (
+            "BaseEvaluatorConfig",
+            "OutputEvaluatorConfig",
+            "BaseLLMJudgeEvaluatorConfig",
+        ):
+            return
+
+        # Extract from Pydantic's model_fields which preserves generic types
+        if (
+            hasattr(config_type, "model_fields")
+            and "default_evaluation_criteria" in config_type.model_fields
+        ):
+            field_info = config_type.model_fields["default_evaluation_criteria"]
+            if hasattr(field_info, "annotation"):
+                annotation = field_info.annotation
+                # The annotation will be SomeCriteria | None
+                args = get_args(annotation)
+                if args:
+                    # Get the criteria type (the non-None arg)
+                    for arg in args:
+                        if (
+                            arg is not type(None)
+                            and isinstance(arg, type)
+                            and issubclass(arg, BaseEvaluationCriteria)
+                        ):
+                            # Found the config's criteria type, check if it matches
+                            if arg != criteria_type:
+                                raise UiPathEvaluationError(
+                                    code="TYPE_INCONSISTENCY_IN_EVALUATOR",
+                                    title=f"Type inconsistency in {cls.__name__}: "
+                                    f"Config {config_type.__name__} expects criteria type {arg.__name__}",
+                                    detail=f"Evaluator expects {criteria_type.__name__}. "
+                                    f"Ensure BaseEvaluator[T, C[T], J] has matching T and C[T] parameters.",
+                                    category=UiPathEvaluationErrorCategory.SYSTEM,
+                                )
+                            return  # Validation passed
+
+    @classmethod
+    def _extract_evaluation_criteria_type(cls) -> type[BaseEvaluationCriteria]:
+        """Extract the evaluation criteria type from Pydantic model fields.
+
+        Returns:
+            The evaluation criteria type
+
+        Raises:
+            ValueError: If no valid evaluation criteria type can be determined from the class definition
+        """
+        # Special case: if this is the BaseEvaluator class itself, return BaseEvaluationCriteria
+        if cls.__name__ == ("BaseEvaluator" or "BaseEvaluator[Any, Any, Any]"):
+            return BaseEvaluationCriteria
+
+        # Check if Pydantic has already resolved the evaluation_criteria_type field annotation
+        if not (
+            hasattr(cls, "model_fields")
+            and "evaluation_criteria_type" in cls.model_fields
+        ):
+            raise UiPathEvaluationError(
+                code="COULD_NOT_FIND_EVALUATION_CRITERIA_TYPE_FIELD",
+                title=f"Could not find evaluation_criteria_type field in {cls.__name__}",
+                detail="Ensure the class properly inherits from BaseEvaluator with correct Generic parameters.",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        field_info = cls.model_fields["evaluation_criteria_type"]
+        if not hasattr(field_info, "annotation"):
+            raise UiPathEvaluationError(
+                code="NO_ANNOTATION_FOUND_FOR_EVALUATION_CRITERIA_TYPE_FIELD",
+                title=f"No annotation found for evaluation_criteria_type field in {cls.__name__}",
+                detail="Ensure the class properly inherits from BaseEvaluator with correct Generic parameters.",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        # Extract the inner type from type[SomeType]
+        annotation = field_info.annotation
+        args = get_args(annotation)
+        if not args:
+            raise UiPathEvaluationError(
+                code="INVALID_ANNOTATION_FOR_EVALUATION_CRITERIA_TYPE",
+                title=f"Invalid annotation for evaluation_criteria_type in {cls.__name__}: {annotation}",
+                detail="Expected type[SomeEvaluationCriteria]",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        criteria_type = args[0]
+        if not (
+            isinstance(criteria_type, type)
+            and issubclass(criteria_type, BaseEvaluationCriteria)
+        ):
+            raise UiPathEvaluationError(
+                code="INVALID_EVALUATION_CRITERIA_TYPE",
+                title=f"Invalid evaluation criteria type {criteria_type} in {cls.__name__}",
+                detail=f"{criteria_type} must be a subclass of BaseEvaluationCriteria",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        return criteria_type
+
+    @classmethod
+    def _extract_config_type(cls) -> type[BaseEvaluatorConfig[Any]]:
+        """Extract the config type from Pydantic model fields.
+
+        Returns:
+            The config type for this evaluator
+
+        Raises:
+            ValueError: If no valid config type can be determined from the class definition
+        """
+        # Special case: if this is the BaseEvaluator class itself, return BaseEvaluatorConfig
+        if cls.__name__ == ("BaseEvaluator" or "BaseEvaluator[Any, Any, Any]"):
+            return BaseEvaluatorConfig
+        # Check if Pydantic has already resolved the config_type field annotation
+        if not (hasattr(cls, "model_fields") and "config_type" in cls.model_fields):
+            raise UiPathEvaluationError(
+                code="COULD_NOT_FIND_CONFIG_TYPE_FIELD",
+                title=f"Could not find config_type field in {cls.__name__}",
+                detail="Ensure the class properly inherits from BaseEvaluator with correct Generic parameters.",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        field_info = cls.model_fields["config_type"]
+        if not hasattr(field_info, "annotation"):
+            raise UiPathEvaluationError(
+                code="NO_ANNOTATION_FOUND_FOR_CONFIG_TYPE_FIELD",
+                title=f"No annotation found for config_type field in {cls.__name__}",
+                detail="Ensure the class properly inherits from BaseEvaluator with correct Generic parameters.",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        # Extract the inner type from type[SomeType]
+        annotation = field_info.annotation
+        args = get_args(annotation)
+        if not args:
+            raise UiPathEvaluationError(
+                code="INVALID_ANNOTATION_FOR_CONFIG_TYPE",
+                title=f"Invalid annotation for config_type in {cls.__name__}: {annotation}",
+                detail="Expected type[SomeEvaluatorConfig]",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        config_type = args[0]
+        if not (
+            isinstance(config_type, type)
+            and issubclass(config_type, BaseEvaluatorConfig)
+        ):
+            raise UiPathEvaluationError(
+                code="INVALID_CONFIG_TYPE",
+                title=f"Invalid config type {config_type} in {cls.__name__}",
+                detail=f"{config_type} must be a subclass of BaseEvaluatorConfig",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        return config_type
+
+    @classmethod
+    def _extract_justification_type(cls) -> type[J]:
+        """Extract the justification type from Pydantic model fields.
+
+        Returns:
+            The justification type (str, None, or BaseEvaluatorJustification subclass)
+
+        Note:
+            Unlike the other type extraction methods, this one returns a default (type(None))
+            instead of raising an error, since justification support is optional and
+            defaults to None for evaluators that don't specify a justification type.
+        """
+        try:
+            # Special case: if this is the BaseEvaluator class itself, return type(None)
+            if cls.__name__ == "BaseEvaluator[Any, Any, Any]":
+                return cast(type[J], type(None))
+
+            # Check if Pydantic has resolved the justification_type field annotation
+            if not (
+                hasattr(cls, "model_fields")
+                and "justification_type" in cls.model_fields
+            ):
+                # Default to None if field doesn't exist (justification is optional)
+                return cast(type[J], type(None))
+
+            field_info = cls.model_fields["justification_type"]
+            if not hasattr(field_info, "annotation"):
+                # Default to None if no annotation (justification is optional)
+                return cast(type[J], type(None))
+
+            # Extract the inner type from type[SomeType]
+            annotation = field_info.annotation
+            args = get_args(annotation)
+            if not args:
+                # Default to None if no type args (justification is optional)
+                return cast(type[J], type(None))
+
+            justification_type = args[0]
+
+            # Validate the justification type - must be str, type(None), or BaseEvaluatorJustification subclass
+            if justification_type is str or justification_type is type(None):
+                return cast(type[J], justification_type)
+            elif isinstance(justification_type, type) and issubclass(
+                justification_type, BaseEvaluatorJustification
+            ):
+                return cast(type[J], justification_type)
+            else:
+                # Invalid justification type - log warning but default to None for robustness
+                warnings.warn(
+                    f"Invalid justification type {justification_type} in {cls.__name__}. "
+                    f"Must be str, None, or subclass of BaseEvaluatorJustification. Defaulting to None.",
+                    UserWarning,
+                    stacklevel=2,
+                )
+                return cast(type[J], type(None))
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="CANNOT_EXTRACT_JUSTIFICATION_TYPE",
+                title=f"Cannot extract justification type from {cls.__name__}",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+
+    def validate_evaluation_criteria(self, criteria: Any) -> T:
+        """Validate and convert input to the correct evaluation criteria type.
+
+        Uses Pydantic's model_validate for proper validation, type coercion,
+        and error handling.
+
+        Args:
+            criteria: The criteria to validate (dict, BaseEvaluationCriteria, or other)
+
+        Returns:
+            An instance of the evaluation criteria type (T)
+
+        Raises:
+            ValueError: If the criteria cannot be converted to the expected type
+        """
+        try:
+            if isinstance(criteria, self.evaluation_criteria_type):
+                return criteria
+            elif isinstance(criteria, dict):
+                return self.evaluation_criteria_type.model_validate(criteria)
+            elif hasattr(criteria, "__dict__"):
+                # Try to convert from another object type
+                return self.evaluation_criteria_type.model_validate(criteria.__dict__)
+            else:
+                # Try to let Pydantic handle the conversion
+                return self.evaluation_criteria_type.model_validate(criteria)
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="CANNOT_VALIDATE_EVALUATION_CRITERIA",
+                title=f"Cannot validate {type(criteria)} to {self.evaluation_criteria_type}",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+
+    def validate_justification(self, justification: Any) -> J:
+        """Validate and convert input to the correct justification type.
+
+        Args:
+            justification: The justification to validate (str, None, dict, BaseEvaluatorJustification, or other)
+
+        Returns:
+            The validated justification of the correct type
+        """
+        # The key insight: J is constrained to be one of str, None, or BaseEvaluatorJustification
+        # At instantiation time, J gets bound to exactly one of these types
+        # We need to handle each case and ensure the return matches the bound type
+        try:
+            # Handle None type - when J is bound to None (the literal None type)
+            if self.justification_type is type(None):
+                # When J is None, we can only return None
+                return cast(J, justification if justification is None else None)
+
+            # Handle str type - when J is bound to str
+            if self.justification_type is str:
+                # When J is str, we must return a str
+                if justification is None:
+                    return cast(J, "")
+                return cast(J, str(justification))
+
+            # Handle BaseEvaluatorJustification subclasses - when J is bound to a specific subclass
+            if isinstance(self.justification_type, type) and issubclass(
+                self.justification_type, BaseEvaluatorJustification
+            ):
+                # When J is a BaseEvaluatorJustification subclass, we must return that type
+                if justification is None:
+                    raise ValueError(
+                        f"None is not allowed for justification type {self.justification_type}"
+                    )
+
+                if isinstance(justification, self.justification_type):
+                    return justification
+                elif isinstance(justification, dict):
+                    return self.justification_type.model_validate(justification)
+                elif hasattr(justification, "__dict__"):
+                    return self.justification_type.model_validate(
+                        justification.__dict__
+                    )
+                else:
+                    return self.justification_type.model_validate(justification)
+        except Exception as e:
+            raise UiPathEvaluationError(
+                code="CANNOT_CONVERT_JUSTIFICATION",
+                title=f"Cannot convert {type(justification)} to {self.justification_type}",
+                detail=f"Error: {e}",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            ) from e
+
+        # Fallback: this should never happen
+        raise UiPathEvaluationError(
+            code="UNSUPPORTED_JUSTIFICATION_TYPE",
+            title=f"Unsupported justification type {self.justification_type} for input {type(justification)}",
+            detail=f"Unsupported justification type {self.justification_type} for input {type(justification)}",
+            category=UiPathEvaluationErrorCategory.SYSTEM,
+        )
+
+    @classmethod
+    def get_evaluation_criteria_schema(cls) -> dict[str, Any]:
+        """Get the JSON schema for the evaluation criteria type.
+
+        Returns:
+            The JSON schema for the evaluation criteria type
+        """
+        criteria_type = cls._extract_evaluation_criteria_type()
+        return criteria_type.model_json_schema(by_alias=False)
+
+    @classmethod
+    def get_config_schema(cls) -> dict[str, Any]:
+        """Get the JSON schema for the config type.
+
+        Returns:
+            The JSON schema for the config type
+        """
+        config_type = cls._extract_config_type()
+        return config_type.model_json_schema(by_alias=False)
+
+    @classmethod
+    def get_justification_schema(cls) -> dict[str, Any]:
+        """Get the JSON schema for the justification type.
+
+        Returns:
+            The JSON schema for the justification type
+        """
+        justification_type = cls._extract_justification_type()
+        if justification_type is type(None):
+            return {}
+        elif justification_type is str:
+            return {"type": "string"}
+        elif isinstance(justification_type, type) and issubclass(
+            justification_type, BaseEvaluatorJustification
+        ):
+            return justification_type.model_json_schema(by_alias=False)
+        else:
+            raise UiPathEvaluationError(
+                code="INVALID_JUSTIFICATION_TYPE",
+                title=f"Invalid justification type {justification_type} in {cls.__name__}",
+                detail="Must be str, None, or subclass of BaseEvaluatorJustification",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+    def _canonical_json(self, obj: Any) -> str:
+        """Convert an object to canonical JSON string for consistent comparison.
+
+        Args:
+            obj: The object to convert to canonical JSON
+
+        Returns:
+            str: Canonical JSON string with normalized numbers and sorted keys
+        """
+        return json.dumps(
+            obj,
+            sort_keys=True,
+            separators=(",", ":"),
+            ensure_ascii=False,
+        )
+
+    @classmethod
+    @abstractmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
         pass

+    @classmethod
+    def generate_json_type(cls) -> dict[str, Any]:
+        """Generate the JSON schema for the evaluator."""
+        return {
+            "evaluatorTypeId": cls.get_evaluator_id(),
+            "evaluatorConfigSchema": cls.get_config_schema(),
+            "evaluationCriteriaSchema": cls.get_evaluation_criteria_schema(),
+            "justificationSchema": cls.get_justification_schema(),
+        }
+
+    async def validate_and_evaluate_criteria(
+        self, agent_execution: AgentExecution, evaluation_criteria: Any
+    ) -> EvaluationResult:
+        """Evaluate the given data and return a result from a raw evaluation criteria."""
+        if evaluation_criteria is None:
+            evaluation_criteria = self.evaluator_config.default_evaluation_criteria
+        if evaluation_criteria is None:
+            raise UiPathEvaluationError(
+                code="NO_EVALUATION_CRITERIA_PROVIDED",
+                title="No evaluation criteria provided and no default evaluation criteria configured",
+                detail="No evaluation criteria provided and no default evaluation criteria configured",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+        criteria = self.validate_evaluation_criteria(evaluation_criteria)
+        return await self.evaluate(agent_execution, criteria)
+
     @abstractmethod
     async def evaluate(
         self, agent_execution: AgentExecution, evaluation_criteria: T
@@ -78,8 +579,9 @@ class BaseEvaluator(BaseModel, Generic[T], ABC):
         Args:
             agent_execution: The execution details containing:
                 - agent_input: The input received by the agent
-                - actual_output: The actual output from the agent
-                - spans: The execution spans to use for the evaluation
+                - agent_output: The actual output from the agent
+                - agent_trace: The execution trace from the agent
+                - simulation_instructions: The simulation instructions for the agent
             evaluation_criteria: The criteria to evaluate

         Returns:
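
The new contract shown above (BaseEvaluationCriteria, BaseEvaluatorConfig, and the three-parameter BaseEvaluator) is easiest to read through a usage sketch. The snippet below is illustrative only: KeywordCriteria, KeywordEvaluatorConfig, KeywordEvaluator and the scoring logic are hypothetical names invented for this example, the EvaluationResult construction is elided because that model is defined elsewhere in the package, and the import from uipath.eval.evaluators.base_evaluator assumes the module path added in this diff.

from uipath.eval.evaluators.base_evaluator import (
    BaseEvaluationCriteria,
    BaseEvaluator,
    BaseEvaluatorConfig,
)
from uipath.eval.models import AgentExecution, EvaluationResult


class KeywordCriteria(BaseEvaluationCriteria):
    # Hypothetical criteria: a keyword that must appear in the agent output.
    keyword: str


class KeywordEvaluatorConfig(BaseEvaluatorConfig[KeywordCriteria]):
    # Parametrizing the config ties default_evaluation_criteria to KeywordCriteria.
    case_sensitive: bool = False


class KeywordEvaluator(BaseEvaluator[KeywordCriteria, KeywordEvaluatorConfig, str]):
    # T, C and J are spelled out so evaluate() gets a precise criteria type and
    # the before-validator can check that T matches C's generic parameter.

    @classmethod
    def get_evaluator_id(cls) -> str:
        return "KeywordEvaluator"  # hypothetical id

    async def evaluate(
        self, agent_execution: AgentExecution, evaluation_criteria: KeywordCriteria
    ) -> EvaluationResult:
        haystack = str(agent_execution.agent_output)
        needle = evaluation_criteria.keyword
        if not self.evaluator_config.case_sensitive:
            haystack, needle = haystack.lower(), needle.lower()
        score = 1.0 if needle in haystack else 0.0
        ...  # wrap `score` in an EvaluationResult (construction omitted in this sketch)


# Only `id` and the raw config dict are passed; validate_model() extracts the generic
# types and builds evaluator_config. Keys may be camelCase (to_camel aliasing) or
# snake_case (populate_by_name=True).
evaluator = KeywordEvaluator(
    id="kw-1",
    config={
        "name": "keyword-check",
        "defaultEvaluationCriteria": {"keyword": "done"},
        "caseSensitive": False,
    },
)

The custom_evaluator.py.template, cli_add.py and cli_register.py entries in the file list suggest the CLI now scaffolds and registers evaluators of this shape, though that workflow sits outside this file's diff.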