uipath 2.1.107__py3-none-any.whl → 2.1.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_runtime/_contracts.py +25 -5
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_resources/SDK_REFERENCE.md +0 -97
- uipath/_uipath.py +10 -37
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/_evals/_runtime.py
CHANGED

@@ -24,7 +24,7 @@ from ..._events._events import (
     EvalSetRunUpdatedEvent,
     EvaluationEvents,
 )
-from ...eval.evaluators import BaseEvaluator
+from ...eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
 from ...eval.models import EvaluationResult
 from ...eval.models.models import AgentExecution, EvalItemResult
 from .._runtime._contracts import (
@@ -38,7 +38,13 @@ from .._runtime._contracts import (
 from .._runtime._logging import ExecutionLogHandler
 from .._utils._eval_set import EvalHelpers
 from ._evaluator_factory import EvaluatorFactory
-from ._models._evaluation_set import
+from ._models._evaluation_set import (
+    AnyEvaluationItem,
+    AnyEvaluationSet,
+    AnyEvaluator,
+    EvaluationItem,
+    LegacyEvaluationItem,
+)
 from ._models._exceptions import EvaluationRuntimeException
 from ._models._output import (
     EvaluationResultDto,
@@ -182,7 +188,8 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

         event_bus = self.event_bus

-
+        # Load eval set (path is already resolved in cli_eval.py)
+        evaluation_set, _ = EvalHelpers.load_eval_set(
             self.context.eval_set, self.context.eval_ids
         )
         evaluators = self._load_evaluators(evaluation_set)
@@ -215,6 +222,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
             evaluation_set_name=evaluation_set.name,
             evaluation_set_results=eval_run_result_list,
         )
+
         # Computing evaluator averages
         evaluator_averages: Dict[str, float] = defaultdict(float)
         evaluator_count: Dict[str, int] = defaultdict(int)
@@ -245,8 +253,8 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

     async def _execute_sequential(
         self,
-        evaluation_set:
-        evaluators: List[
+        evaluation_set: AnyEvaluationSet,
+        evaluators: List[AnyEvaluator],
         event_bus: EventBus,
     ) -> List[EvaluationRunResult]:
         all_eval_run_result: list[EvaluationRunResult] = []
@@ -260,13 +268,13 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

     async def _execute_parallel(
         self,
-        evaluation_set:
-        evaluators: List[
+        evaluation_set: AnyEvaluationSet,
+        evaluators: List[AnyEvaluator],
         event_bus: EventBus,
         workers: int,
     ) -> List[EvaluationRunResult]:
         # Create a queue with max concurrency
-        queue: asyncio.Queue[tuple[int,
+        queue: asyncio.Queue[tuple[int, AnyEvaluationItem]] = asyncio.Queue(
             maxsize=workers
         )

@@ -276,7 +284,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
         # Producer task to fill the queue
         async def producer() -> None:
             for index, eval_item in enumerate(evaluation_set.evaluations):
-                await queue.put((index, eval_item))
+                await queue.put((index, eval_item))  # type: ignore[arg-type]
             # Signal completion by putting None markers
             for _ in range(workers):
                 await queue.put(None)  # type: ignore
@@ -318,15 +326,12 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

     async def _execute_eval(
         self,
-        eval_item:
-        evaluators: List[
+        eval_item: AnyEvaluationItem,
+        evaluators: List[AnyEvaluator],
         event_bus: EventBus,
     ) -> EvaluationRunResult:
-        # Generate LLM-based input if input_mocking_strategy is defined
-        if eval_item.input_mocking_strategy:
-            eval_item = await self._generate_input_for_eval(eval_item)
-
         execution_id = str(uuid.uuid4())
+
         set_execution_context(eval_item, self.span_collector, execution_id)

         await event_bus.publish(
@@ -346,11 +351,41 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
         evaluation_item_results: list[EvalItemResult] = []

         for evaluator in evaluators:
-
-
-
-
-
+            # Determine which evaluator method to use based on evaluation set/item type
+            evaluation_result: Optional[EvaluationResult] = None
+
+            match eval_item:
+                case LegacyEvaluationItem():
+                    # Legacy evaluation - use run_legacy_evaluator
+                    evaluation_result = await self.run_legacy_evaluator(
+                        evaluator=evaluator,  # type: ignore
+                        execution_output=agent_execution_output,
+                        eval_item=eval_item,
+                    )
+                case EvaluationItem() if (
+                    evaluator.id in eval_item.evaluation_criterias
+                ):
+                    # New evaluation with criteria
+                    evaluation_criteria = eval_item.evaluation_criterias[
+                        evaluator.id
+                    ]
+
+                    evaluation_result = await self.run_evaluator(
+                        evaluator=evaluator,  # type: ignore
+                        execution_output=agent_execution_output,
+                        eval_item=eval_item,
+                        evaluation_criteria=evaluator.evaluation_criteria_type(  # type: ignore
+                            **evaluation_criteria
+                        )
+                        if evaluation_criteria
+                        else evaluator.evaluator_config.default_evaluation_criteria,  # type: ignore
+                    )
+                case _:
+                    # Skip if evaluator not in evaluation criteria
+                    continue
+
+            if evaluation_result is None:
+                continue

             dto_result = EvaluationResultDto.from_evaluation_result(
                 evaluation_result
@@ -449,7 +484,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
         return spans, logs

     async def execute_runtime(
-        self, eval_item:
+        self, eval_item: AnyEvaluationItem, execution_id: str
     ) -> UiPathEvalRunExecutionOutput:
         context_args = self.context.model_dump()
         context_args["execution_id"] = execution_id
@@ -486,7 +521,6 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

         if result is None:
             raise ValueError("Execution result cannot be None for eval runs")
-
         return UiPathEvalRunExecutionOutput(
             execution_time=end_time - start_time,
             spans=spans,
@@ -501,9 +535,31 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

     async def run_evaluator(
         self,
-        evaluator: BaseEvaluator[Any],
+        evaluator: BaseEvaluator[Any, Any, Any],
         execution_output: UiPathEvalRunExecutionOutput,
         eval_item: EvaluationItem,
+        *,
+        evaluation_criteria: Any,
+    ) -> EvaluationResult:
+        agent_execution = AgentExecution(
+            agent_input=eval_item.inputs,
+            agent_output=execution_output.result.output or {},
+            agent_trace=execution_output.spans,
+            expected_agent_behavior=eval_item.expected_agent_behavior,
+        )
+
+        result = await evaluator.validate_and_evaluate_criteria(
+            agent_execution=agent_execution,
+            evaluation_criteria=evaluation_criteria,
+        )
+
+        return result
+
+    async def run_legacy_evaluator(
+        self,
+        evaluator: LegacyBaseEvaluator[Any],
+        execution_output: UiPathEvalRunExecutionOutput,
+        eval_item: LegacyEvaluationItem,
     ) -> EvaluationResult:
         agent_execution = AgentExecution(
             agent_input=eval_item.inputs,
@@ -520,9 +576,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):

         return result

-    def _load_evaluators(
-        self, evaluation_set: EvaluationSet
-    ) -> List[BaseEvaluator[Any]]:
+    def _load_evaluators(self, evaluation_set: AnyEvaluationSet) -> list[AnyEvaluator]:
         """Load evaluators referenced by the evaluation set."""
         evaluators = []
         evaluators_dir = Path(self.context.eval_set).parent.parent / "evaluators"  # type: ignore
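Taken together, the `_runtime.py` changes split evaluation into two paths: legacy items go through `run_legacy_evaluator`, while criteria-based items look up the evaluator's entry in `evaluation_criterias` and call `run_evaluator`. A minimal, self-contained sketch of that routing, using simplified stand-in dataclasses rather than the SDK's real pydantic models:

```python
"""Sketch of the legacy vs. criteria-based evaluator dispatch.

Class names mirror the diff, but the bodies are simplified stand-ins;
the real models live in _models/_evaluation_set.py and carry more fields.
"""
from dataclasses import dataclass, field
from typing import Any, Dict, Optional, Union


@dataclass
class LegacyEvaluationItem:
    inputs: Dict[str, Any]


@dataclass
class EvaluationItem:
    inputs: Dict[str, Any]
    # Maps evaluator id -> criteria dict (None means "use the evaluator's defaults").
    evaluation_criterias: Dict[str, Optional[Dict[str, Any]]] = field(default_factory=dict)


AnyEvaluationItem = Union[LegacyEvaluationItem, EvaluationItem]


def pick_evaluation_path(eval_item: AnyEvaluationItem, evaluator_id: str) -> str:
    """Return which runtime method would handle this (evaluator, item) pair."""
    match eval_item:
        case LegacyEvaluationItem():
            return "run_legacy_evaluator"
        case EvaluationItem() if evaluator_id in eval_item.evaluation_criterias:
            return "run_evaluator"
        case _:
            # Evaluator not referenced by this item's criteria: skipped.
            return "skip"


if __name__ == "__main__":
    legacy = LegacyEvaluationItem(inputs={"q": "hi"})
    modern = EvaluationItem(inputs={"q": "hi"}, evaluation_criterias={"exact-match": None})
    print(pick_evaluation_path(legacy, "exact-match"))  # run_legacy_evaluator
    print(pick_evaluation_path(modern, "exact-match"))  # run_evaluator
    print(pick_evaluation_path(modern, "llm-judge"))    # skip
```

In the real runtime, the criteria branch additionally instantiates `evaluator.evaluation_criteria_type(**criteria)` or falls back to `evaluator_config.default_evaluation_criteria` before calling `validate_and_evaluate_criteria`.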
uipath/_cli/_evals/mocks/input_mocker.py
CHANGED

@@ -67,9 +67,7 @@ async def generate_llm_input(
         if evaluation_item.input_mocking_strategy
         else "",
         expected_behavior=evaluation_item.expected_agent_behavior or "",
-        expected_output=json.dumps(evaluation_item.
-        if evaluation_item.expected_output
-        else "",
+        expected_output=json.dumps(evaluation_item.evaluation_criterias, indent=2),
     )

     response_format = {
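The `input_mocker.py` change swaps the single `expected_output` value for a dump of the whole `evaluation_criterias` mapping when building the input-generation prompt. A small illustration of what that serialization yields; the criteria shape below is hypothetical, only the `json.dumps(..., indent=2)` call comes from the diff:

```python
import json

# Hypothetical criteria mapping as it might appear on an EvaluationItem;
# the exact schema is defined by the SDK, not by this sketch.
evaluation_criterias = {
    "exact-match-evaluator": {"expected_output": {"status": "approved"}},
    "llm-judge-evaluator": None,  # None -> evaluator falls back to its defaults
}

# Mirrors the new prompt field: expected_output=json.dumps(..., indent=2)
expected_output = json.dumps(evaluation_criterias, indent=2)
print(expected_output)
```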
uipath/_cli/_evals/mocks/llm_mocker.py
CHANGED

@@ -10,7 +10,7 @@ from uipath.tracing._traced import traced
 from uipath.tracing._utils import _SpanUtils

 from .._models._evaluation_set import (
-
+    AnyEvaluationItem,
     LLMMockingStrategy,
 )
 from .._models._mocks import ExampleCall
@@ -77,7 +77,7 @@ def pydantic_to_dict_safe(obj: Any) -> Any:
 class LLMMocker(Mocker):
     """LLM Based Mocker."""

-    def __init__(self, evaluation_item:
+    def __init__(self, evaluation_item: AnyEvaluationItem):
         """LLM Mocker constructor."""
         self.evaluation_item = evaluation_item
         assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)
uipath/_cli/_evals/mocks/mocker_factory.py
CHANGED

@@ -1,7 +1,7 @@
 """Mocker Factory."""

 from uipath._cli._evals._models._evaluation_set import (
-
+    AnyEvaluationItem,
     LLMMockingStrategy,
     MockitoMockingStrategy,
 )
@@ -14,7 +14,7 @@ class MockerFactory:
     """Mocker factory."""

     @staticmethod
-    def create(evaluation_item:
+    def create(evaluation_item: AnyEvaluationItem) -> Mocker:
         """Create a mocker instance."""
         match evaluation_item.mocking_strategy:
             case LLMMockingStrategy():
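`mocker_factory.py` only gains the `AnyEvaluationItem` annotation, but the surrounding pattern, structural matching on the item's `mocking_strategy` to pick a mocker, is worth illustrating. A stand-alone sketch of that dispatch with placeholder strategy and mocker classes (the real ones carry configuration and behavior):

```python
from dataclasses import dataclass
from typing import Optional, Union


@dataclass
class LLMMockingStrategy:
    prompt: str = ""


@dataclass
class MockitoMockingStrategy:
    behaviors: tuple = ()


class Mocker:  # placeholder base class
    pass


class LLMMocker(Mocker):
    def __init__(self, strategy: LLMMockingStrategy):
        self.strategy = strategy


class MockitoMocker(Mocker):
    def __init__(self, strategy: MockitoMockingStrategy):
        self.strategy = strategy


def create_mocker(
    strategy: Union[LLMMockingStrategy, MockitoMockingStrategy, None],
) -> Optional[Mocker]:
    """Dispatch on the strategy type, as MockerFactory.create does on mocking_strategy."""
    match strategy:
        case LLMMockingStrategy():
            return LLMMocker(strategy)
        case MockitoMockingStrategy():
            return MockitoMocker(strategy)
        case _:
            # No mocking configured for this evaluation item.
            return None


if __name__ == "__main__":
    print(type(create_mocker(LLMMockingStrategy(prompt="simulate tool"))).__name__)  # LLMMocker
    print(type(create_mocker(MockitoMockingStrategy())).__name__)                    # MockitoMocker
```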
uipath/_cli/_evals/mocks/mockito_mocker.py
CHANGED

@@ -9,7 +9,7 @@ from hydra.utils import instantiate
 from mockito import invocation, mocking  # type: ignore[import-untyped]

 from uipath._cli._evals._models._evaluation_set import (
-
+    AnyEvaluationItem,
     MockingAnswerType,
     MockitoMockingStrategy,
 )
@@ -38,7 +38,7 @@ class Stub:
 class MockitoMocker(Mocker):
     """Mockito Mocker."""

-    def __init__(self, evaluation_item:
+    def __init__(self, evaluation_item: AnyEvaluationItem):
         """Instantiate a mockito mocker."""
         self.evaluation_item = evaluation_item
         assert isinstance(self.evaluation_item.mocking_strategy, MockitoMockingStrategy)
uipath/_cli/_evals/mocks/mocks.py
CHANGED

@@ -4,13 +4,13 @@ import logging
 from contextvars import ContextVar
 from typing import Any, Callable, Optional

-from uipath._cli._evals._models._evaluation_set import
+from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem
 from uipath._cli._evals._span_collection import ExecutionSpanCollector
 from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError
 from uipath._cli._evals.mocks.mocker_factory import MockerFactory

 # Context variables for evaluation items and mockers
-evaluation_context: ContextVar[Optional[
+evaluation_context: ContextVar[Optional[AnyEvaluationItem]] = ContextVar(
     "evaluation", default=None
 )

@@ -30,7 +30,9 @@ logger = logging.getLogger(__name__)


 def set_execution_context(
-    eval_item:
+    eval_item: AnyEvaluationItem,
+    span_collector: ExecutionSpanCollector,
+    execution_id: str,
 ) -> None:
     """Set the execution context for an evaluation run for mocking and trace access."""
     evaluation_context.set(eval_item)
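`mocks.py` keeps the module-level `ContextVar` and now passes the span collector and execution id into `set_execution_context`. A reduced sketch of the `ContextVar` mechanism itself; `current_evaluation` is a hypothetical reader added for the demo, and the span-collector and execution-id bookkeeping is omitted:

```python
from contextvars import ContextVar
from typing import Any, Optional

# Module-level context variable, as in mocks.py: the active evaluation item
# (if any) for the code that is currently executing.
evaluation_context: ContextVar[Optional[Any]] = ContextVar("evaluation", default=None)


def set_execution_context(eval_item: Any) -> None:
    """Reduced version: the real function also records a span collector and execution id."""
    evaluation_context.set(eval_item)


def current_evaluation() -> Optional[Any]:
    """Hypothetical helper: read back the active evaluation item, or None outside a run."""
    return evaluation_context.get()


if __name__ == "__main__":
    print(current_evaluation())              # None
    set_execution_context({"id": "eval-1"})
    print(current_evaluation())              # {'id': 'eval-1'}
```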
uipath/_cli/_push/models.py
ADDED

@@ -0,0 +1,17 @@
+"""Models for push command."""
+
+from pydantic import BaseModel, Field
+
+
+class EvaluatorFileDetails(BaseModel):
+    """Details about an evaluator file for push operations."""
+
+    path: str
+    custom_evaluator_file_name: str = Field(
+        "", description="Name of the custom evaluator file, if available."
+    )
+
+    @property
+    def is_custom(self) -> bool:
+        """Check if this is a custom evaluator."""
+        return len(self.custom_evaluator_file_name) > 0
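The new `uipath/_cli/_push/models.py` is small enough to exercise directly. A usage sketch of `EvaluatorFileDetails`; only the model definition comes from the diff, and how `sw_file_handler.py` actually consumes it is an assumption:

```python
from pydantic import BaseModel, Field


class EvaluatorFileDetails(BaseModel):
    """Details about an evaluator file for push operations (as added in the diff)."""

    path: str
    custom_evaluator_file_name: str = Field(
        "", description="Name of the custom evaluator file, if available."
    )

    @property
    def is_custom(self) -> bool:
        """Check if this is a custom evaluator."""
        return len(self.custom_evaluator_file_name) > 0


if __name__ == "__main__":
    builtin = EvaluatorFileDetails(path="evaluators/exact_match.json")
    custom = EvaluatorFileDetails(
        path="evaluators/my_evaluator.json",
        custom_evaluator_file_name="my_evaluator.py",
    )
    # is_custom distinguishes bundled evaluators from user-supplied ones, which a
    # push handler could presumably use to decide whether extra code must be uploaded.
    print(builtin.is_custom, custom.is_custom)  # False True
```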