uipath-2.1.108-py3-none-any.whl → uipath-2.1.109-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (69)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  19. uipath/_cli/_utils/_eval_set.py +30 -9
  20. uipath/_cli/_utils/_resources.py +21 -0
  21. uipath/_cli/_utils/_studio_project.py +18 -0
  22. uipath/_cli/cli_add.py +114 -0
  23. uipath/_cli/cli_eval.py +5 -1
  24. uipath/_cli/cli_pull.py +11 -26
  25. uipath/_cli/cli_push.py +2 -0
  26. uipath/_cli/cli_register.py +45 -0
  27. uipath/_events/_events.py +6 -5
  28. uipath/_utils/constants.py +4 -0
  29. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  30. uipath/eval/_helpers/helpers.py +30 -2
  31. uipath/eval/evaluators/__init__.py +60 -5
  32. uipath/eval/evaluators/base_evaluator.py +546 -44
  33. uipath/eval/evaluators/contains_evaluator.py +80 -0
  34. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  35. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  36. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  37. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  38. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  39. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  40. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  41. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  42. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  43. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  44. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  45. uipath/eval/evaluators/output_evaluator.py +117 -0
  46. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  47. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  48. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  49. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  50. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  51. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  52. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  53. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  54. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  55. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  56. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  57. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  58. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  59. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  60. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  61. uipath/eval/evaluators_types/generate_types.py +31 -0
  62. uipath/eval/models/__init__.py +16 -1
  63. uipath/eval/models/llm_judge_types.py +196 -0
  64. uipath/eval/models/models.py +109 -7
  65. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
  66. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/RECORD +69 -37
  67. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
  68. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
  69. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/_evals/_runtime.py
@@ -24,7 +24,7 @@ from ..._events._events import (
     EvalSetRunUpdatedEvent,
     EvaluationEvents,
 )
-from ...eval.evaluators import BaseEvaluator
+from ...eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
 from ...eval.models import EvaluationResult
 from ...eval.models.models import AgentExecution, EvalItemResult
 from .._runtime._contracts import (
@@ -38,7 +38,13 @@ from .._runtime._contracts import (
 from .._runtime._logging import ExecutionLogHandler
 from .._utils._eval_set import EvalHelpers
 from ._evaluator_factory import EvaluatorFactory
-from ._models._evaluation_set import EvaluationItem, EvaluationSet
+from ._models._evaluation_set import (
+    AnyEvaluationItem,
+    AnyEvaluationSet,
+    AnyEvaluator,
+    EvaluationItem,
+    LegacyEvaluationItem,
+)
 from ._models._exceptions import EvaluationRuntimeException
 from ._models._output import (
     EvaluationResultDto,
@@ -182,7 +188,8 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

         event_bus = self.event_bus

-        evaluation_set = EvalHelpers.load_eval_set(
+        # Load eval set (path is already resolved in cli_eval.py)
+        evaluation_set, _ = EvalHelpers.load_eval_set(
             self.context.eval_set, self.context.eval_ids
         )
         evaluators = self._load_evaluators(evaluation_set)
@@ -215,6 +222,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
             evaluation_set_name=evaluation_set.name,
             evaluation_set_results=eval_run_result_list,
         )
+
         # Computing evaluator averages
         evaluator_averages: Dict[str, float] = defaultdict(float)
         evaluator_count: Dict[str, int] = defaultdict(int)
@@ -245,8 +253,8 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

     async def _execute_sequential(
         self,
-        evaluation_set: EvaluationSet,
-        evaluators: List[BaseEvaluator[Any]],
+        evaluation_set: AnyEvaluationSet,
+        evaluators: List[AnyEvaluator],
         event_bus: EventBus,
     ) -> List[EvaluationRunResult]:
         all_eval_run_result: list[EvaluationRunResult] = []
@@ -260,13 +268,13 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

     async def _execute_parallel(
         self,
-        evaluation_set: EvaluationSet,
-        evaluators: List[BaseEvaluator[Any]],
+        evaluation_set: AnyEvaluationSet,
+        evaluators: List[AnyEvaluator],
         event_bus: EventBus,
         workers: int,
     ) -> List[EvaluationRunResult]:
         # Create a queue with max concurrency
-        queue: asyncio.Queue[tuple[int, EvaluationItem]] = asyncio.Queue(
+        queue: asyncio.Queue[tuple[int, AnyEvaluationItem]] = asyncio.Queue(
             maxsize=workers
         )

@@ -276,7 +284,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
         # Producer task to fill the queue
         async def producer() -> None:
            for index, eval_item in enumerate(evaluation_set.evaluations):
-                await queue.put((index, eval_item))
+                await queue.put((index, eval_item))  # type: ignore[arg-type]
             # Signal completion by putting None markers
             for _ in range(workers):
                 await queue.put(None)  # type: ignore
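
Note: the producer above relies on a standard asyncio shutdown idiom — it enqueues one None sentinel per worker so that every consumer eventually receives one and exits, while the bounded queue gives back-pressure. A minimal, self-contained sketch of that idiom with generic names (not the actual uipath worker code):

import asyncio

async def producer(queue: asyncio.Queue, items: list[str], workers: int) -> None:
    for item in items:
        await queue.put(item)   # suspends when the queue is full (back-pressure)
    for _ in range(workers):
        await queue.put(None)   # one sentinel per worker signals shutdown

async def worker(queue: asyncio.Queue, results: list[str]) -> None:
    while True:
        item = await queue.get()
        if item is None:        # sentinel received: stop consuming
            break
        results.append(item.upper())

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue(maxsize=2)
    results: list[str] = []
    workers = 3
    await asyncio.gather(
        producer(queue, ["a", "b", "c", "d"], workers),
        *(worker(queue, results) for _ in range(workers)),
    )
    print(sorted(results))  # ['A', 'B', 'C', 'D']

asyncio.run(main())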
@@ -318,15 +326,12 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

     async def _execute_eval(
         self,
-        eval_item: EvaluationItem,
-        evaluators: List[BaseEvaluator[Any]],
+        eval_item: AnyEvaluationItem,
+        evaluators: List[AnyEvaluator],
         event_bus: EventBus,
     ) -> EvaluationRunResult:
-        # Generate LLM-based input if input_mocking_strategy is defined
-        if eval_item.input_mocking_strategy:
-            eval_item = await self._generate_input_for_eval(eval_item)
-
         execution_id = str(uuid.uuid4())
+
         set_execution_context(eval_item, self.span_collector, execution_id)

         await event_bus.publish(
@@ -346,11 +351,41 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
         evaluation_item_results: list[EvalItemResult] = []

         for evaluator in evaluators:
-            evaluation_result = await self.run_evaluator(
-                evaluator=evaluator,
-                execution_output=agent_execution_output,
-                eval_item=eval_item,
-            )
+            # Determine which evaluator method to use based on evaluation set/item type
+            evaluation_result: Optional[EvaluationResult] = None
+
+            match eval_item:
+                case LegacyEvaluationItem():
+                    # Legacy evaluation - use run_legacy_evaluator
+                    evaluation_result = await self.run_legacy_evaluator(
+                        evaluator=evaluator,  # type: ignore
+                        execution_output=agent_execution_output,
+                        eval_item=eval_item,
+                    )
+                case EvaluationItem() if (
+                    evaluator.id in eval_item.evaluation_criterias
+                ):
+                    # New evaluation with criteria
+                    evaluation_criteria = eval_item.evaluation_criterias[
+                        evaluator.id
+                    ]
+
+                    evaluation_result = await self.run_evaluator(
+                        evaluator=evaluator,  # type: ignore
+                        execution_output=agent_execution_output,
+                        eval_item=eval_item,
+                        evaluation_criteria=evaluator.evaluation_criteria_type(  # type: ignore
+                            **evaluation_criteria
+                        )
+                        if evaluation_criteria
+                        else evaluator.evaluator_config.default_evaluation_criteria,  # type: ignore
+                    )
+                case _:
+                    # Skip if evaluator not in evaluation criteria
+                    continue
+
+            if evaluation_result is None:
+                continue

             dto_result = EvaluationResultDto.from_evaluation_result(
                 evaluation_result
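
Note: the rewritten loop dispatches on the concrete type of the evaluation item — LegacyEvaluationItem instances keep the old behaviour via run_legacy_evaluator, EvaluationItem instances are only scored by evaluators listed in their evaluation_criterias mapping (falling back to the evaluator's default criteria when the entry is empty), and everything else is skipped. A stripped-down sketch of that class-pattern-plus-guard dispatch, using stand-in dataclasses rather than the real uipath models:

from dataclasses import dataclass, field
from typing import Any

@dataclass
class LegacyItem:  # stand-in for LegacyEvaluationItem
    inputs: dict[str, Any]

@dataclass
class CriteriaItem:  # stand-in for EvaluationItem
    inputs: dict[str, Any]
    evaluation_criterias: dict[str, Any] = field(default_factory=dict)

def dispatch(item: object, evaluator_id: str) -> str:
    match item:
        case LegacyItem():
            return "legacy evaluator path"
        case CriteriaItem() if evaluator_id in item.evaluation_criterias:
            return f"criteria path with {item.evaluation_criterias[evaluator_id]!r}"
        case _:
            return "skipped"

print(dispatch(LegacyItem(inputs={}), "ev-1"))                                        # legacy evaluator path
print(dispatch(CriteriaItem(inputs={}, evaluation_criterias={"ev-1": {}}), "ev-1"))   # criteria path with {}
print(dispatch(CriteriaItem(inputs={}), "ev-1"))                                      # skipped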
@@ -449,7 +484,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
         return spans, logs

     async def execute_runtime(
-        self, eval_item: EvaluationItem, execution_id: str
+        self, eval_item: AnyEvaluationItem, execution_id: str
     ) -> UiPathEvalRunExecutionOutput:
         context_args = self.context.model_dump()
         context_args["execution_id"] = execution_id
@@ -486,7 +521,6 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

         if result is None:
             raise ValueError("Execution result cannot be None for eval runs")
-
         return UiPathEvalRunExecutionOutput(
             execution_time=end_time - start_time,
             spans=spans,
@@ -501,9 +535,31 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

     async def run_evaluator(
         self,
-        evaluator: BaseEvaluator[Any],
+        evaluator: BaseEvaluator[Any, Any, Any],
         execution_output: UiPathEvalRunExecutionOutput,
         eval_item: EvaluationItem,
+        *,
+        evaluation_criteria: Any,
+    ) -> EvaluationResult:
+        agent_execution = AgentExecution(
+            agent_input=eval_item.inputs,
+            agent_output=execution_output.result.output or {},
+            agent_trace=execution_output.spans,
+            expected_agent_behavior=eval_item.expected_agent_behavior,
+        )
+
+        result = await evaluator.validate_and_evaluate_criteria(
+            agent_execution=agent_execution,
+            evaluation_criteria=evaluation_criteria,
+        )
+
+        return result
+
+    async def run_legacy_evaluator(
+        self,
+        evaluator: LegacyBaseEvaluator[Any],
+        execution_output: UiPathEvalRunExecutionOutput,
+        eval_item: LegacyEvaluationItem,
     ) -> EvaluationResult:
         agent_execution = AgentExecution(
             agent_input=eval_item.inputs,
@@ -520,9 +576,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

         return result

-    def _load_evaluators(
-        self, evaluation_set: EvaluationSet
-    ) -> List[BaseEvaluator[Any]]:
+    def _load_evaluators(self, evaluation_set: AnyEvaluationSet) -> list[AnyEvaluator]:
         """Load evaluators referenced by the evaluation set."""
         evaluators = []
         evaluators_dir = Path(self.context.eval_set).parent.parent / "evaluators"  # type: ignore

uipath/_cli/_evals/mocks/input_mocker.py
@@ -67,9 +67,7 @@ async def generate_llm_input(
         if evaluation_item.input_mocking_strategy
         else "",
         expected_behavior=evaluation_item.expected_agent_behavior or "",
-        expected_output=json.dumps(evaluation_item.expected_output, indent=2)
-        if evaluation_item.expected_output
-        else "",
+        expected_output=json.dumps(evaluation_item.evaluation_criterias, indent=2),
     )

     response_format = {

uipath/_cli/_evals/mocks/llm_mocker.py
@@ -10,7 +10,7 @@ from uipath.tracing._traced import traced
 from uipath.tracing._utils import _SpanUtils

 from .._models._evaluation_set import (
-    EvaluationItem,
+    AnyEvaluationItem,
     LLMMockingStrategy,
 )
 from .._models._mocks import ExampleCall
@@ -77,7 +77,7 @@ def pydantic_to_dict_safe(obj: Any) -> Any:
 class LLMMocker(Mocker):
     """LLM Based Mocker."""

-    def __init__(self, evaluation_item: EvaluationItem):
+    def __init__(self, evaluation_item: AnyEvaluationItem):
         """LLM Mocker constructor."""
         self.evaluation_item = evaluation_item
         assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)

uipath/_cli/_evals/mocks/mocker_factory.py
@@ -1,7 +1,7 @@
 """Mocker Factory."""

 from uipath._cli._evals._models._evaluation_set import (
-    EvaluationItem,
+    AnyEvaluationItem,
     LLMMockingStrategy,
     MockitoMockingStrategy,
 )
@@ -14,7 +14,7 @@ class MockerFactory:
     """Mocker factory."""

     @staticmethod
-    def create(evaluation_item: EvaluationItem) -> Mocker:
+    def create(evaluation_item: AnyEvaluationItem) -> Mocker:
         """Create a mocker instance."""
         match evaluation_item.mocking_strategy:
             case LLMMockingStrategy():

uipath/_cli/_evals/mocks/mockito_mocker.py
@@ -9,7 +9,7 @@ from hydra.utils import instantiate
 from mockito import invocation, mocking  # type: ignore[import-untyped]

 from uipath._cli._evals._models._evaluation_set import (
-    EvaluationItem,
+    AnyEvaluationItem,
     MockingAnswerType,
     MockitoMockingStrategy,
 )
@@ -38,7 +38,7 @@ class Stub:
 class MockitoMocker(Mocker):
     """Mockito Mocker."""

-    def __init__(self, evaluation_item: EvaluationItem):
+    def __init__(self, evaluation_item: AnyEvaluationItem):
         """Instantiate a mockito mocker."""
         self.evaluation_item = evaluation_item
         assert isinstance(self.evaluation_item.mocking_strategy, MockitoMockingStrategy)

uipath/_cli/_evals/mocks/mocks.py
@@ -4,13 +4,13 @@ import logging
 from contextvars import ContextVar
 from typing import Any, Callable, Optional

-from uipath._cli._evals._models._evaluation_set import EvaluationItem
+from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem
 from uipath._cli._evals._span_collection import ExecutionSpanCollector
 from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError
 from uipath._cli._evals.mocks.mocker_factory import MockerFactory

 # Context variables for evaluation items and mockers
-evaluation_context: ContextVar[Optional[EvaluationItem]] = ContextVar(
+evaluation_context: ContextVar[Optional[AnyEvaluationItem]] = ContextVar(
     "evaluation", default=None
 )

@@ -30,7 +30,9 @@ logger = logging.getLogger(__name__)


 def set_execution_context(
-    eval_item: EvaluationItem, span_collector: ExecutionSpanCollector, execution_id: str
+    eval_item: AnyEvaluationItem,
+    span_collector: ExecutionSpanCollector,
+    execution_id: str,
 ) -> None:
     """Set the execution context for an evaluation run for mocking and trace access."""
     evaluation_context.set(eval_item)

uipath/_cli/_push/models.py (new file)
@@ -0,0 +1,17 @@
+"""Models for push command."""
+
+from pydantic import BaseModel, Field
+
+
+class EvaluatorFileDetails(BaseModel):
+    """Details about an evaluator file for push operations."""
+
+    path: str
+    custom_evaluator_file_name: str = Field(
+        "", description="Name of the custom evaluator file, if available."
+    )
+
+    @property
+    def is_custom(self) -> bool:
+        """Check if this is a custom evaluator."""
+        return len(self.custom_evaluator_file_name) > 0
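
Note: based solely on the model shown above, push-side usage is presumably along these lines (the file paths below are illustrative, not taken from the package):

from uipath._cli._push.models import EvaluatorFileDetails

built_in = EvaluatorFileDetails(path="evals/evaluators/exact-match.json")
custom = EvaluatorFileDetails(
    path="evals/evaluators/my-evaluator.json",
    custom_evaluator_file_name="my_evaluator.py",
)

print(built_in.is_custom)  # False -- no custom evaluator file attached
print(custom.is_custom)    # True  -- a custom evaluator .py file accompanies the JSON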