uipath-2.1.51-py3-none-any.whl → uipath-2.1.53-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/_evals/{_evaluators/_evaluator_factory.py → _evaluator_factory.py} +24 -23
- uipath/_cli/_evals/_models/_evaluation_set.py +23 -18
- uipath/_cli/_evals/_models/_evaluator_base_params.py +16 -0
- uipath/_cli/_evals/_models/_output.py +85 -0
- uipath/_cli/_evals/_runtime.py +102 -10
- uipath/_cli/_runtime/_contracts.py +12 -3
- uipath/_cli/_utils/_eval_set.py +1 -1
- uipath/_cli/_utils/_project_files.py +1 -0
- uipath/_cli/cli_eval.py +46 -61
- uipath/eval/evaluators/__init__.py +15 -0
- uipath/eval/evaluators/base_evaluator.py +88 -0
- uipath/eval/evaluators/deterministic_evaluator_base.py +53 -0
- uipath/eval/evaluators/exact_match_evaluator.py +37 -0
- uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py} +23 -40
- uipath/eval/evaluators/llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/trajectory_evaluator.py +36 -0
- uipath/eval/models/__init__.py +19 -0
- uipath/{_cli/_evals/_models/_evaluators.py → eval/models/models.py} +67 -43
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/METADATA +1 -1
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/RECORD +23 -23
- uipath/_cli/_evals/_evaluators/__init__.py +0 -22
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +0 -46
- uipath/_cli/_evals/_evaluators/_evaluator_base.py +0 -124
- uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +0 -40
- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +0 -183
- uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py +0 -48
- uipath/_cli/_evals/_models/__init__.py +0 -18
- uipath/_cli/_evals/_models/_agent_execution_output.py +0 -14
- uipath/_cli/_evals/progress_reporter.py +0 -304
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/WHEEL +0 -0
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/licenses/LICENSE +0 -0
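
The shape of this release: the private evaluator implementations under uipath/_cli/_evals/_evaluators/ are deleted and re-homed in a new public uipath.eval package (evaluators plus their shared result/enum models), while the CLI keeps thin private glue (_evaluator_factory.py, _runtime.py, and the _models entries). A minimal import-migration sketch, assuming the new uipath/eval/evaluators/__init__.py re-exports the classes under the same names as the removed private __init__.py shown later in this diff (the new __all__ is not included here):

# uipath 2.1.51 (removed in 2.1.53): evaluators lived in a private CLI package
# from uipath._cli._evals._evaluators import ExactMatchEvaluator, LlmAsAJudgeEvaluator

# uipath 2.1.53: public package; names assumed to mirror the old exports
from uipath.eval.evaluators import (
    ExactMatchEvaluator,
    JsonSimilarityEvaluator,
    LlmAsAJudgeEvaluator,
    TrajectoryEvaluator,
)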
--- uipath-2.1.51.dist-info/RECORD
+++ uipath-2.1.53.dist-info/RECORD
@@ -9,7 +9,7 @@ uipath/_cli/__init__.py,sha256=tscKceSouYcEOxUbGjoyHi4qGi74giBFeXG1I-ut1hs,2308
 uipath/_cli/cli_auth.py,sha256=i3ykLlCg68xgPXHHaa0agHwGFIiLiTLzOiF6Su8XaEo,2436
 uipath/_cli/cli_deploy.py,sha256=KPCmQ0c_NYD5JofSDao5r6QYxHshVCRxlWDVnQvlp5w,645
 uipath/_cli/cli_dev.py,sha256=nEfpjw1PZ72O6jmufYWVrueVwihFxDPOeJakdvNHdOA,2146
-uipath/_cli/cli_eval.py,sha256=
+uipath/_cli/cli_eval.py,sha256=uiisQ3Wpalu8e9gHRMmn_9Gqus_t4brbjTegMQOhLa0,3831
 uipath/_cli/cli_init.py,sha256=Ac3-9tIH3rpikIX1ehWTo7InW5tjVNoz_w6fjvgLK4w,7052
 uipath/_cli/cli_invoke.py,sha256=4jyhqcy7tPrpxvaUhW-9gut6ddsCGMdJJcpOXXmIe8g,4348
 uipath/_cli/cli_new.py,sha256=9378NYUBc9j-qKVXV7oja-jahfJhXBg8zKVyaon7ctY,2102
@@ -44,22 +44,13 @@ uipath/_cli/_dev/_terminal/_styles/terminal.tcss,sha256=ktVpKwXIXw2VZp8KIZD6fO9i
 uipath/_cli/_dev/_terminal/_utils/_chat.py,sha256=YUZxYVdmEManwHDuZsczJT1dWIYE1dVBgABlurwMFcE,8493
 uipath/_cli/_dev/_terminal/_utils/_exporter.py,sha256=oI6D_eMwrh_2aqDYUh4GrJg8VLGrLYhDahR-_o0uJns,4144
 uipath/_cli/_dev/_terminal/_utils/_logger.py,sha256=jeNShEED27cNIHTe_NNx-2kUiXpSLTmi0onM6tVkqRM,888
-uipath/_cli/_evals/
-uipath/_cli/_evals/
-uipath/_cli/_evals/
-uipath/_cli/_evals/
-uipath/_cli/_evals/
-uipath/_cli/_evals/_evaluators/_evaluator_factory.py,sha256=cURShn17X6BW-_G3rknJXWtlgpeh5UdioLUV6oGCGAU,4912
-uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py,sha256=lvEtAitrZy9myoZLMXLqlBWBPX06Msu67kuFMGSbikM,1319
-uipath/_cli/_evals/_evaluators/_json_similarity_evaluator.py,sha256=HpmkvuwU4Az3IIqFVLUmDvzkqb21pFMxY0sg2biZOMM,7093
-uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py,sha256=nSLZ29xWqALEI53ifr79JPXjyx0T4sr7p-4NygwgAio,6594
-uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py,sha256=dnogQTOskpI4_cNF0Ge3hBceJJocvOgxBWAwaCWnzB0,1595
-uipath/_cli/_evals/_models/__init__.py,sha256=Ewjp3u2YeTH2MmzY9LWf7EIbAoIf_nW9fMYbj7pGlPs,420
-uipath/_cli/_evals/_models/_agent_execution_output.py,sha256=llvApU4JkTnNgQ5DvHPt8ee3bnV6cCANyeiebWKE07E,401
-uipath/_cli/_evals/_models/_evaluation_set.py,sha256=tVHykSget-G3sOCs9bSchMYUTpFqzXVlYYbY8L9SI0c,1518
-uipath/_cli/_evals/_models/_evaluators.py,sha256=l57NEVyYmzSKuoIXuGkE94Br01hAMg35fiS2MlTkaQM,2115
+uipath/_cli/_evals/_evaluator_factory.py,sha256=2lOalabNSzmnnwr0SfoPWvFWXs0Ly857XBmPuOdhFBQ,4729
+uipath/_cli/_evals/_runtime.py,sha256=KFGl2we1RH0omuD2HWw5thIK6DDZxVGtqx_G9T4DM_A,8332
+uipath/_cli/_evals/_models/_evaluation_set.py,sha256=mwcTstHuyHd7ys_nLzgCNKBAsS4ns9UL2TF5Oq2Cc64,1758
+uipath/_cli/_evals/_models/_evaluator_base_params.py,sha256=lTYKOV66tcjW85KHTyOdtF1p1VDaBNemrMAvH8bFIFc,382
+uipath/_cli/_evals/_models/_output.py,sha256=TTQ0hhmD3dTkIbj_Ly_rDCGSnpZsHwdmCsl7FLdoZD0,2634
 uipath/_cli/_push/sw_file_handler.py,sha256=AX4TKM-q6CNGw3JyBW02M8ktPZuFMcAU9LN3Ii0Q2QI,18202
-uipath/_cli/_runtime/_contracts.py,sha256=
+uipath/_cli/_runtime/_contracts.py,sha256=FziI4E1GF-brOwkYSGEUvRAKhLJUQ_2ZwUMefynl944,28764
 uipath/_cli/_runtime/_escalation.py,sha256=x3vI98qsfRA-fL_tNkRVTFXioM5Gv2w0GFcXJJ5eQtg,7981
 uipath/_cli/_runtime/_hitl.py,sha256=VKbM021nVg1HEDnTfucSLJ0LsDn83CKyUtVzofS2qTU,11369
 uipath/_cli/_runtime/_logging.py,sha256=MGklGKPjYKjs7J5Jy9eplA9zCDsdtEbkZdCbTwgut_4,8311
@@ -74,12 +65,12 @@ uipath/_cli/_utils/_common.py,sha256=CzhhkIRfCuQ1-5HLDtjzOyt8KFs1jm6wzrBeU_v2B7c
 uipath/_cli/_utils/_console.py,sha256=scvnrrFoFX6CE451K-PXKV7UN0DUkInbOtDZ5jAdPP0,10070
 uipath/_cli/_utils/_constants.py,sha256=rS8lQ5Nzull8ytajK6lBsz398qiCp1REoAwlHtyBwF0,1415
 uipath/_cli/_utils/_debug.py,sha256=zamzIR4VgbdKADAE4gbmjxDsbgF7wvdr7C5Dqp744Oc,1739
-uipath/_cli/_utils/_eval_set.py,sha256=
+uipath/_cli/_utils/_eval_set.py,sha256=4aP8yAC-jMrNYaC62Yj8fHD2hNlotGwy63bciQrpdc4,2766
 uipath/_cli/_utils/_folders.py,sha256=UVJcKPfPAVR5HF4AP6EXdlNVcfEF1v5pwGCpoAgBY34,1155
 uipath/_cli/_utils/_input_args.py,sha256=3LGNqVpJItvof75VGm-ZNTUMUH9-c7-YgleM5b2YgRg,5088
 uipath/_cli/_utils/_parse_ast.py,sha256=8Iohz58s6bYQ7rgWtOTjrEInLJ-ETikmOMZzZdIY2Co,20072
 uipath/_cli/_utils/_processes.py,sha256=q7DfEKHISDWf3pngci5za_z0Pbnf_shWiYEcTOTCiyk,1855
-uipath/_cli/_utils/_project_files.py,sha256=
+uipath/_cli/_utils/_project_files.py,sha256=sulh3xZhDDw_rBOrn_XSUfVSD6sUu47ZK4n_lF5BKkQ,13197
 uipath/_cli/_utils/_studio_project.py,sha256=HvzcpIIIA4hUIvMbId1dsAhmFLMuhnS2ZtyNdcpXJ8c,15422
 uipath/_cli/_utils/_tracing.py,sha256=2igb03j3EHjF_A406UhtCKkPfudVfFPjUq5tXUEG4oo,1541
 uipath/_cli/_utils/_uv_helpers.py,sha256=6SvoLnZPoKIxW0sjMvD1-ENV_HOXDYzH34GjBqwT138,3450
@@ -122,6 +113,15 @@ uipath/agent/conversation/meta.py,sha256=3t0eS9UHoAPHre97QTUeVbjDhnMX4zj4-qG6ju0
 uipath/agent/conversation/tool.py,sha256=ol8XI8AVd-QNn5auXNBPcCzOkh9PPFtL7hTK3kqInkU,2191
 uipath/eval/_helpers/__init__.py,sha256=GSmZMryjuO3Wo_zdxZdrHCRRsgOxsVFYkYgJ15YNC3E,86
 uipath/eval/_helpers/helpers.py,sha256=iE2HHdMiAdAMLqxHkPKHpfecEtAuN5BTBqvKFTI8ciE,1315
+uipath/eval/evaluators/__init__.py,sha256=DJAAhgv0I5UfBod4sGnSiKerfrz1iMmk7GNFb71V8eI,494
+uipath/eval/evaluators/base_evaluator.py,sha256=gryaN7WMV__NGorwu4WPRL5A5RlJ1exQ9jDJ6ZrXDB8,2679
+uipath/eval/evaluators/deterministic_evaluator_base.py,sha256=yDWTMU1mG-93D6DscAUHmaVUc1rhGYtNjGXgevzAObM,1723
+uipath/eval/evaluators/exact_match_evaluator.py,sha256=Qfz-kIUf80PKjAuge1Tc1GvN6kDB6hHveBZ86w_2How,1512
+uipath/eval/evaluators/json_similarity_evaluator.py,sha256=cP4kpN-UIf690V5dq4LaCjJc2zFx-nEffUclCwDdlhM,6607
+uipath/eval/evaluators/llm_as_judge_evaluator.py,sha256=l0bbn8ZLi9ZTXcgr7tJ2tsCvHFqIIeGa7sobaAHgI2Y,4927
+uipath/eval/evaluators/trajectory_evaluator.py,sha256=7boiKzjLpQPs8M8y2PGnI3bZQ1MEwR6QRZpXyKQcR7Y,1244
+uipath/eval/models/__init__.py,sha256=x360CDZaRjUL3q3kh2CcXYYrQ47jwn6p6JnmhEIvMlA,419
+uipath/eval/models/models.py,sha256=9IraD5C2KfKK1ZLMZ7jBOJzzHW4X1Dp2k41abqmPMnA,2838
 uipath/models/__init__.py,sha256=d_DkK1AtRUetM1t2NrH5UKgvJOBiynzaKnK5pMY7aIc,1289
 uipath/models/action_schema.py,sha256=tBn1qQ3NQLU5nwWlBIzIKIx3XK5pO_D1S51IjFlZ1FA,610
 uipath/models/actions.py,sha256=1vRsJ3JSmMdPkbiYAiHzY8K44vmW3VlMsmQUBAkSgrQ,3141
@@ -148,8 +148,8 @@ uipath/tracing/_traced.py,sha256=qeVDrds2OUnpdUIA0RhtF0kg2dlAZhyC1RRkI-qivTM,185
 uipath/tracing/_utils.py,sha256=wJRELaPu69iY0AhV432Dk5QYf_N_ViRU4kAUG1BI1ew,10384
 uipath/utils/__init__.py,sha256=VD-KXFpF_oWexFg6zyiWMkxl2HM4hYJMIUDZ1UEtGx0,105
 uipath/utils/_endpoints_manager.py,sha256=iRTl5Q0XAm_YgcnMcJOXtj-8052sr6jpWuPNz6CgT0Q,8408
-uipath-2.1.
-uipath-2.1.
-uipath-2.1.
-uipath-2.1.
-uipath-2.1.
+uipath-2.1.53.dist-info/METADATA,sha256=QYSNjKWwpoLp7Vuc_HWzFplKDGJu_40CGdIhmf9SxFE,6482
+uipath-2.1.53.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+uipath-2.1.53.dist-info/entry_points.txt,sha256=9C2_29U6Oq1ExFu7usihR-dnfIVNSKc-0EFbh0rskB4,43
+uipath-2.1.53.dist-info/licenses/LICENSE,sha256=-KBavWXepyDjimmzH5fVAsi-6jNVpIKFc2kZs0Ri4ng,1058
+uipath-2.1.53.dist-info/RECORD,,
--- uipath/_cli/_evals/_evaluators/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""Evaluators package for the evaluation system.
-
-This package contains all evaluator types and the factory for creating them.
-"""
-
-from ._deterministic_evaluator_base import DeterministicEvaluatorBase
-from ._evaluator_base import EvaluatorBase
-from ._evaluator_factory import EvaluatorFactory
-from ._exact_match_evaluator import ExactMatchEvaluator
-from ._json_similarity_evaluator import JsonSimilarityEvaluator
-from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
-from ._trajectory_evaluator import TrajectoryEvaluator
-
-__all__ = [
-    "EvaluatorBase",
-    "DeterministicEvaluatorBase",
-    "EvaluatorFactory",
-    "JsonSimilarityEvaluator",
-    "ExactMatchEvaluator",
-    "LlmAsAJudgeEvaluator",
-    "TrajectoryEvaluator",
-]
--- uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import copy
-import json
-from abc import ABC
-from typing import Any, Dict, Tuple
-
-from ._evaluator_base import EvaluatorBase
-
-
-class DeterministicEvaluatorBase(EvaluatorBase, ABC):
-    def __init__(self, target_output_key: str = "*"):
-        super().__init__()
-        self.target_output_key = target_output_key
-
-    def _select_targets(
-        self, expected_output: Dict[str, Any], actual_output: Dict[str, Any]
-    ) -> Tuple[Any, Any]:
-        actual_output_copy = copy.deepcopy(actual_output)
-        expected_output_copy = copy.deepcopy(expected_output)
-        if self.target_output_key != "*":
-            if (
-                self.target_output_key not in actual_output
-                or self.target_output_key not in expected_output
-            ):
-                raise ValueError(
-                    f"Field '{self.target_output_key}' missing from expected or actual output"
-                )
-            actual_output_copy = actual_output_copy[self.target_output_key]
-            expected_output_copy = expected_output[self.target_output_key]
-        return actual_output_copy, expected_output_copy
-
-    def _canonical_json(self, obj: Any) -> str:
-        return json.dumps(
-            self._normalize_numbers(obj),
-            sort_keys=True,
-            separators=(",", ":"),
-            ensure_ascii=False,
-        )
-
-    def _normalize_numbers(self, obj: Any) -> Any:
-        if isinstance(obj, dict):
-            return {k: self._normalize_numbers(v) for k, v in obj.items()}
-        if isinstance(obj, (list, tuple)):
-            return [self._normalize_numbers(v) for v in obj]
-        if isinstance(obj, (int, float)) and not isinstance(obj, bool):
-            return float(obj)
-        return obj
--- uipath/_cli/_evals/_evaluators/_evaluator_base.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import functools
-import time
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import Any, Dict
-
-from uipath._cli._evals._models import (
-    EvaluationResult,
-    EvaluatorCategory,
-    EvaluatorType,
-)
-
-
-def measure_execution_time(func):
-    """Decorator to measure execution time and update EvaluationResult.evaluation_time."""
-
-    @functools.wraps(func)
-    async def wrapper(*args, **kwargs) -> EvaluationResult:
-        start_time = time.time()
-        result = await func(*args, **kwargs)
-        end_time = time.time()
-        execution_time = end_time - start_time
-
-        result.evaluation_time = execution_time
-        return result
-
-    return wrapper
-
-
-@dataclass
-class EvaluatorBaseParams:
-    """Parameters for initializing the base evaluator."""
-
-    evaluator_id: str
-    category: EvaluatorCategory
-    evaluator_type: EvaluatorType
-    name: str
-    description: str
-    created_at: str
-    updated_at: str
-    target_output_key: str
-
-
-class EvaluatorBase(ABC):
-    """Abstract base class for all evaluators."""
-
-    def __init__(self):
-        # initialization done via 'from_params' function
-        self.id: str
-        self.name: str
-        self.description: str
-        self.created_at: str
-        self.updated_at: str
-        self.category: EvaluatorCategory
-        self.type: EvaluatorType
-        self.target_output_key: str
-        pass
-
-    @classmethod
-    def from_params(cls, params: EvaluatorBaseParams, **kwargs):
-        """Initialize the base evaluator from parameters.
-
-        Args:
-            params: EvaluatorBaseParams containing base configuration
-            **kwargs: Additional specific parameters for concrete evaluators
-
-        Returns:
-            Initialized evaluator instance
-        """
-        instance = cls(**kwargs)
-        instance.id = params.evaluator_id
-        instance.category = params.category
-        instance.type = params.evaluator_type
-        instance.name = params.name
-        instance.description = params.description
-        instance.created_at = params.created_at
-        instance.updated_at = params.updated_at
-        instance.target_output_key = params.target_output_key
-        return instance
-
-    @measure_execution_time
-    @abstractmethod
-    async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
-    ) -> EvaluationResult:
-        """Evaluate the given data and return a result.
-
-        Args:
-            evaluation_id: The ID of the evaluation being processed
-            evaluation_name: The name of the evaluation
-            input_data: The input data for the evaluation
-            expected_output: The expected output
-            actual_output: The actual output from the agent
-
-        Returns:
-            EvaluationResult containing the score and details
-        """
-        pass
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert the evaluator instance to a dictionary representation.
-
-        Returns:
-            Dict[str, Any]: Dictionary containing all evaluator properties
-        """
-        return {
-            "id": self.id,
-            "name": self.name,
-            "description": self.description,
-            "created_at": self.created_at,
-            "updated_at": self.updated_at,
-            "category": self.category.name if self.category else None,
-            "type": self.type.name if self.type else None,
-            "target_output_key": self.target_output_key,
-        }
-
-    def __repr__(self) -> str:
-        """String representation of the evaluator."""
-        return f"{self.__class__.__name__}(id='{self.id}', name='{self.name}', category={self.category.name})"
--- uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import copy
-from typing import Any, Dict
-
-from uipath._cli._evals._evaluators._deterministic_evaluator_base import (
-    DeterministicEvaluatorBase,
-)
-from uipath._cli._evals._models import EvaluationResult
-from uipath._cli._evals._models._evaluators import ScoreType
-
-
-class ExactMatchEvaluator(DeterministicEvaluatorBase):
-    async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
-    ) -> EvaluationResult:
-        actual_output_copy = copy.deepcopy(actual_output)
-        expected_output_copy = copy.deepcopy(expected_output)
-
-        actual_output, expected_output = self._select_targets(
-            expected_output, actual_output
-        )
-        are_equal = self._canonical_json(actual_output) == self._canonical_json(
-            expected_output
-        )
-
-        return EvaluationResult(
-            evaluation_id=evaluation_id,
-            evaluation_name=evaluation_name,
-            evaluator_id=self.id,
-            evaluator_name=self.name,
-            score=are_equal,
-            input=input_data,
-            expected_output=expected_output_copy,
-            actual_output=actual_output_copy,
-            score_type=ScoreType.BOOLEAN,
-        )
--- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py
+++ /dev/null
@@ -1,183 +0,0 @@
-import json
-from typing import Any, Dict
-
-from ...._config import Config
-from ...._execution_context import ExecutionContext
-from ...._services.llm_gateway_service import UiPathLlmChatService
-from ...._utils.constants import (
-    ENV_BASE_URL,
-    ENV_UIPATH_ACCESS_TOKEN,
-    ENV_UNATTENDED_USER_ACCESS_TOKEN,
-    COMMUNITY_agents_SUFFIX,
-)
-from .._models import EvaluationResult, LLMResponse
-from .._models._evaluators import ScoreType
-from ._evaluator_base import EvaluatorBase
-
-
-class LlmAsAJudgeEvaluator(EvaluatorBase):
-    """Evaluator that uses an LLM to judge the quality of outputs."""
-
-    def __init__(self, prompt: str = "", model: str = "", target_output_key: str = "*"):
-        """Initialize the LLM-as-a-judge evaluator.
-
-        Args:
-            prompt: The prompt template for the LLM
-            model: The model to use for evaluation
-            target_output_key: Key in output to evaluate ("*" for entire output)
-        """
-        super().__init__()
-        self.actual_output_placeholder = "{{ActualOutput}}"
-        self.expected_output_placeholder = "{{ExpectedOutput}}"
-        self._initialize_llm()
-        self.prompt = prompt
-        self.model = model
-        self.target_output_key: str = target_output_key
-
-    def _initialize_llm(self):
-        """Initialize the LLM used for evaluation."""
-        import os
-
-        base_url_value: str = os.getenv(ENV_BASE_URL)  # type: ignore
-        secret_value: str = os.getenv(ENV_UNATTENDED_USER_ACCESS_TOKEN) or os.getenv(
-            ENV_UIPATH_ACCESS_TOKEN
-        )  # type: ignore
-        config = Config(
-            base_url=base_url_value,
-            secret=secret_value,
-        )
-        self.llm = UiPathLlmChatService(config, ExecutionContext())
-
-    async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
-    ) -> EvaluationResult:
-        """Evaluate using an LLM as a judge.
-
-        Args:
-            evaluation_id: The ID of the evaluation being processed
-            evaluation_name: The name of the evaluation
-            input_data: The input data for the evaluation
-            expected_output: The expected output
-            actual_output: The actual output from the agent
-
-        Returns:
-            EvaluationResult containing the score and details
-        """
-        # Extract the target value to evaluate
-        target_value = self._extract_target_value(actual_output)
-        expected_value = self._extract_target_value(expected_output)
-
-        # Create the evaluation prompt
-        evaluation_prompt = self._create_evaluation_prompt(expected_value, target_value)
-
-        llm_response = await self._get_llm_response(evaluation_prompt)
-
-        return EvaluationResult(
-            evaluation_id=evaluation_id,
-            evaluation_name=evaluation_name,
-            evaluator_id=self.id,
-            evaluator_name=self.name,
-            score=llm_response.score,
-            input=input_data,
-            expected_output=expected_output,
-            actual_output=actual_output,
-            details=llm_response.justification,
-            score_type=ScoreType.NUMERICAL,
-        )
-
-    def _extract_target_value(self, output: Dict[str, Any]) -> Any:
-        """Extract the target value from output based on target_output_key."""
-        if self.target_output_key == "*":
-            return output
-
-        # Handle nested keys
-        keys = self.target_output_key.split(".")
-        value = output
-
-        try:
-            for key in keys:
-                if isinstance(value, dict):
-                    value = value[key]
-                else:
-                    return None
-            return value
-        except (KeyError, TypeError):
-            return None
-
-    def _create_evaluation_prompt(
-        self, expected_output: Any, actual_output: Any
-    ) -> str:
-        """Create the evaluation prompt for the LLM."""
-        formatted_prompt = self.prompt.replace(
-            self.actual_output_placeholder,
-            str(actual_output),
-        )
-        formatted_prompt = formatted_prompt.replace(
-            self.expected_output_placeholder,
-            str(expected_output),
-        )
-
-        return formatted_prompt
-
-    async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
-        """Get response from the LLM.
-
-        Args:
-            evaluation_prompt: The formatted prompt to send to the LLM
-
-        Returns:
-            LLMResponse with score and justification
-        """
-        try:
-            # remove community-agents suffix from llm model name
-            model = self.model
-            if model.endswith(COMMUNITY_agents_SUFFIX):
-                model = model.replace(COMMUNITY_agents_SUFFIX, "")
-
-            # Prepare the request
-            request_data = {
-                "model": model,
-                "messages": [{"role": "user", "content": evaluation_prompt}],
-                "response_format": {
-                    "type": "json_schema",
-                    "json_schema": {
-                        "name": "evaluation_response",
-                        "schema": {
-                            "type": "object",
-                            "properties": {
-                                "score": {
-                                    "type": "number",
-                                    "minimum": 0,
-                                    "maximum": 100,
-                                    "description": "Score between 0 and 100",
-                                },
-                                "justification": {
-                                    "type": "string",
-                                    "description": "Explanation for the score",
-                                },
-                            },
-                            "required": ["score", "justification"],
-                        },
-                    },
-                },
-            }
-
-            response = await self.llm.chat_completions(**request_data)
-
-            try:
-                return LLMResponse(**json.loads(response.choices[-1].message.content))
-            except (json.JSONDecodeError, ValueError) as e:
-                return LLMResponse(
-                    score=0.0, justification=f"Error parsing LLM response: {str(e)}"
-                )
-
-        except Exception as e:
-            # Fallback in case of any errors
-            return LLMResponse(
-                score=0.0, justification=f"Error during LLM evaluation: {str(e)}"
-            )
--- uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from typing import Any, Dict
-
-from .._models import EvaluationResult
-from ._evaluator_base import EvaluatorBase
-
-
-class TrajectoryEvaluator(EvaluatorBase):
-    """Evaluator that analyzes the trajectory/path taken to reach outputs."""
-
-    def __init__(
-        self,
-        trajectory_config: Dict[str, Any],
-        step_weights: Dict[str, float],
-        target_output_key: str = "*",
-    ):
-        """Initialize the trajectory evaluator.
-
-        Args:
-            trajectory_config: Configuration for trajectory analysis
-            step_weights: Weights for different steps in the trajectory
-            target_output_key: Key in output to evaluate ("*" for entire output)
-        """
-        super().__init__()
-        self.trajectory_config = trajectory_config or {}
-        self.step_weights = step_weights or {}
-        self.target_output_key = target_output_key
-
-    async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
-    ) -> EvaluationResult:
-        """Evaluate using trajectory analysis.
-
-        Args:
-            evaluation_id: The ID of the evaluation being processed
-            evaluation_name: The name of the evaluation
-            input_data: The input data for the evaluation
-            expected_output: The expected output
-            actual_output: The actual output from the agent
-
-        Returns:
-            EvaluationResult containing the score and details
-        """
-        raise NotImplementedError()
--- uipath/_cli/_evals/_models/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from uipath._cli._evals._models._evaluation_set import EvaluationItem, EvaluationSet
-from uipath._cli._evals._models._evaluators import (
-    EvaluationResult,
-    EvaluationSetResult,
-    EvaluatorCategory,
-    EvaluatorType,
-    LLMResponse,
-)
-
-__all__ = [
-    "LLMResponse",
-    "EvaluatorCategory",
-    "EvaluatorType",
-    "EvaluationResult",
-    "EvaluationSetResult",
-    "EvaluationItem",
-    "EvaluationSet",
-]
--- uipath/_cli/_evals/_models/_agent_execution_output.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict
-
-from uipath._cli._runtime._contracts import UiPathRuntimeResult
-
-
-class UiPathEvalRunExecutionOutput(BaseModel):
-    """Result of a single agent response."""
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    execution_time: float
-    spans: list[ReadableSpan]
-    result: UiPathRuntimeResult