uipath 2.1.107__py3-none-any.whl → 2.1.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of uipath might be problematic.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_runtime/_contracts.py +25 -5
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_resources/SDK_REFERENCE.md +0 -97
- uipath/_uipath.py +10 -37
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/__init__.py
CHANGED
@@ -4,6 +4,7 @@ import sys
 import click

 from ._utils._common import add_cwd_to_path, load_environment_variables
+from .cli_add import add as add
 from .cli_auth import auth as auth
 from .cli_debug import debug as debug  # type: ignore
 from .cli_deploy import deploy as deploy  # type: ignore
@@ -16,6 +17,7 @@ from .cli_pack import pack as pack  # type: ignore
 from .cli_publish import publish as publish  # type: ignore
 from .cli_pull import pull as pull  # type: ignore
 from .cli_push import push as push  # type: ignore
+from .cli_register import register as register  # type: ignore
 from .cli_run import run as run  # type: ignore


@@ -75,4 +77,6 @@ cli.add_command(push)
 cli.add_command(pull)
 cli.add_command(eval)
 cli.add_command(dev)
+cli.add_command(add)
+cli.add_command(register)
 cli.add_command(debug)
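
The two new modules are wired into the existing click group here: cli_add contributes an "add" command and cli_register a "register" command, registered with cli.add_command alongside run, eval, push, pull and the rest. A quick way to confirm the wiring (a minimal sketch; it only assumes this release of the uipath package is installed) is to list the group's registered command names:

from uipath._cli import cli

# The dict of registered subcommands should now include "add" and "register".
print(sorted(cli.commands))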

uipath/_cli/_evals/_console_progress_reporter.py
CHANGED
@@ -7,6 +7,7 @@ from rich.console import Console
 from rich.rule import Rule
 from rich.table import Table

+from uipath._cli._evals._models._evaluation_set import AnyEvaluator
 from uipath._events._event_bus import EventBus
 from uipath._events._events import (
     EvalRunCreatedEvent,
@@ -15,7 +16,6 @@ from uipath._events._events import (
     EvalSetRunUpdatedEvent,
     EvaluationEvents,
 )
-from uipath.eval.evaluators import BaseEvaluator
 from uipath.eval.models import ScoreType

 logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ class ConsoleProgressReporter:

     def __init__(self):
         self.console = Console()
-        self.evaluators: Dict[str,
+        self.evaluators: Dict[str, AnyEvaluator] = {}
         self.display_started = False
         self.eval_results_by_name: Dict[str, list[Any]] = {}


uipath/_cli/_evals/_evaluator_factory.py
CHANGED
@@ -1,21 +1,69 @@
+import importlib.util
+import sys
+from pathlib import Path
 from typing import Any, Dict

 from pydantic import TypeAdapter

+from uipath._cli._evals._helpers import try_extract_file_and_class_name  # type: ignore
+from uipath._cli._evals._models._evaluation_set import AnyEvaluator
 from uipath._cli._evals._models._evaluator import (
     EqualsEvaluatorParams,
-
+    EvaluatorConfig,
     JsonSimilarityEvaluatorParams,
+    LegacyEvaluator,
     LLMEvaluatorParams,
     TrajectoryEvaluatorParams,
 )
 from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams
 from uipath.eval.evaluators import (
     BaseEvaluator,
+    LegacyBaseEvaluator,
+    LegacyExactMatchEvaluator,
+    LegacyJsonSimilarityEvaluator,
+    LegacyLlmAsAJudgeEvaluator,
+    LegacyTrajectoryEvaluator,
+)
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorConfig
+from uipath.eval.evaluators.contains_evaluator import (
+    ContainsEvaluator,
+    ContainsEvaluatorConfig,
+)
+from uipath.eval.evaluators.exact_match_evaluator import (
     ExactMatchEvaluator,
+    ExactMatchEvaluatorConfig,
+)
+from uipath.eval.evaluators.json_similarity_evaluator import (
     JsonSimilarityEvaluator,
-
-
+    JsonSimilarityEvaluatorConfig,
+)
+from uipath.eval.evaluators.llm_judge_output_evaluator import (
+    LLMJudgeOutputEvaluator,
+    LLMJudgeOutputEvaluatorConfig,
+    LLMJudgeStrictJSONSimilarityOutputEvaluator,
+    LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
+)
+from uipath.eval.evaluators.llm_judge_trajectory_evaluator import (
+    LLMJudgeTrajectoryEvaluator,
+    LLMJudgeTrajectoryEvaluatorConfig,
+    LLMJudgeTrajectorySimulationEvaluator,
+    LLMJudgeTrajectorySimulationEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_args_evaluator import (
+    ToolCallArgsEvaluator,
+    ToolCallArgsEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_count_evaluator import (
+    ToolCallCountEvaluator,
+    ToolCallCountEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_order_evaluator import (
+    ToolCallOrderEvaluator,
+    ToolCallOrderEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_output_evaluator import (
+    ToolCallOutputEvaluator,
+    ToolCallOutputEvaluatorConfig,
 )


@@ -23,7 +71,252 @@ class EvaluatorFactory:
     """Factory class for creating evaluator instances based on configuration."""

     @classmethod
-    def create_evaluator(cls, data: Dict[str, Any]) ->
+    def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator:
+        if data.get("version", None) == "1.0":
+            return cls._create_evaluator_internal(data)
+        return cls._create_legacy_evaluator_internal(data)
+
+    @staticmethod
+    def _create_evaluator_internal(
+        data: Dict[str, Any],
+    ) -> BaseEvaluator[Any, Any, Any]:
+        # check custom evaluator
+        evaluator_schema = data.get("evaluatorSchema", "")
+        success, file_path, class_name = try_extract_file_and_class_name(
+            evaluator_schema
+        )
+        if success:
+            return EvaluatorFactory._create_coded_evaluator_internal(
+                data, file_path, class_name
+            )
+
+        # use built-in evaluators
+        config: BaseEvaluatorConfig[Any] = TypeAdapter(EvaluatorConfig).validate_python(
+            data
+        )
+        match config:
+            case ContainsEvaluatorConfig():
+                return EvaluatorFactory._create_contains_evaluator(data)
+            case ExactMatchEvaluatorConfig():
+                return EvaluatorFactory._create_exact_match_evaluator(data)
+            case JsonSimilarityEvaluatorConfig():
+                return EvaluatorFactory._create_json_similarity_evaluator(data)
+            case LLMJudgeOutputEvaluatorConfig():
+                return EvaluatorFactory._create_llm_judge_output_evaluator(data)
+            case LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig():
+                return EvaluatorFactory._create_llm_judge_strict_json_similarity_output_evaluator(
+                    data
+                )
+            case LLMJudgeTrajectoryEvaluatorConfig():
+                return EvaluatorFactory._create_trajectory_evaluator(data)
+            case ToolCallArgsEvaluatorConfig():
+                return EvaluatorFactory._create_tool_call_args_evaluator(data)
+            case ToolCallCountEvaluatorConfig():
+                return EvaluatorFactory._create_tool_call_count_evaluator(data)
+            case ToolCallOrderEvaluatorConfig():
+                return EvaluatorFactory._create_tool_call_order_evaluator(data)
+            case ToolCallOutputEvaluatorConfig():
+                return EvaluatorFactory._create_tool_call_output_evaluator(data)
+            case LLMJudgeTrajectorySimulationEvaluatorConfig():
+                return (
+                    EvaluatorFactory._create_llm_judge_simulation_trajectory_evaluator(
+                        data
+                    )
+                )
+            case _:
+                raise ValueError(f"Unknown evaluator configuration: {config}")
+
+    @staticmethod
+    def _create_contains_evaluator(data: Dict[str, Any]) -> ContainsEvaluator:
+        evaluator_id = data.get("id")
+        if not evaluator_id or not isinstance(evaluator_id, str):
+            raise ValueError("Evaluator 'id' must be a non-empty string")
+        return ContainsEvaluator(
+            id=evaluator_id,
+            config=data.get("evaluatorConfig"),
+        )  # type: ignore
+
+    @staticmethod
+    def _create_coded_evaluator_internal(
+        data: Dict[str, Any], file_path_str: str, class_name: str
+    ) -> BaseEvaluator[Any, Any, Any]:
+        """Create a coded evaluator by dynamically loading from a Python file.
+
+        Args:
+            data: Dictionary containing evaluator configuration with evaluatorTypeId
+                in format "file://path/to/file.py:ClassName"
+
+        Returns:
+            Instance of the dynamically loaded evaluator class
+
+        Raises:
+            ValueError: If file or class cannot be loaded, or if the class is not a BaseEvaluator subclass
+        """
+        file_path = Path(file_path_str)
+        if not file_path.is_absolute():
+            if not file_path.exists():
+                file_path = (
+                    Path.cwd() / "evals" / "evaluators" / "custom" / file_path_str
+                )
+
+        if not file_path.exists():
+            raise ValueError(
+                f"Evaluator file not found: {file_path}. "
+                f"Make sure the file exists in evals/evaluators/custom/"
+            )
+
+        module_name = f"_custom_evaluator_{file_path.stem}_{id(data)}"
+        spec = importlib.util.spec_from_file_location(module_name, file_path)
+        if spec is None or spec.loader is None:
+            raise ValueError(f"Could not load module from {file_path}")
+
+        module = importlib.util.module_from_spec(spec)
+        sys.modules[module_name] = module
+        try:
+            spec.loader.exec_module(module)
+        except Exception as e:
+            raise ValueError(
+                f"Error executing module from {file_path}: {str(e)}"
+            ) from e
+
+        # Get the class from the module
+        if not hasattr(module, class_name):
+            raise ValueError(
+                f"Class '{class_name}' not found in {file_path}. "
+                f"Available classes: {[name for name in dir(module) if not name.startswith('_')]}"
+            )
+
+        evaluator_class = getattr(module, class_name)
+
+        if not isinstance(evaluator_class, type) or not issubclass(
+            evaluator_class, BaseEvaluator
+        ):
+            raise ValueError(
+                f"Class '{class_name}' must be a subclass of BaseEvaluator"
+            )
+
+        evaluator_id = data.get("id")
+        if not evaluator_id or not isinstance(evaluator_id, str):
+            raise ValueError("Evaluator 'id' must be a non-empty string")
+        return evaluator_class(
+            id=evaluator_id,
+            config=data.get("evaluatorConfig", {}),
+        )  # type: ignore
+
+    @staticmethod
+    def _create_exact_match_evaluator(
+        data: Dict[str, Any],
+    ) -> ExactMatchEvaluator:
+        return TypeAdapter(ExactMatchEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_json_similarity_evaluator(
+        data: Dict[str, Any],
+    ) -> JsonSimilarityEvaluator:
+        return TypeAdapter(JsonSimilarityEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_llm_judge_output_evaluator(
+        data: Dict[str, Any],
+    ) -> LLMJudgeOutputEvaluator:
+        return TypeAdapter(LLMJudgeOutputEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_llm_judge_strict_json_similarity_output_evaluator(
+        data: Dict[str, Any],
+    ) -> LLMJudgeStrictJSONSimilarityOutputEvaluator:
+        return TypeAdapter(LLMJudgeStrictJSONSimilarityOutputEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_trajectory_evaluator(
+        data: Dict[str, Any],
+    ) -> LLMJudgeTrajectoryEvaluator:
+        return TypeAdapter(LLMJudgeTrajectoryEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_tool_call_args_evaluator(
+        data: Dict[str, Any],
+    ) -> ToolCallArgsEvaluator:
+        return TypeAdapter(ToolCallArgsEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_tool_call_count_evaluator(
+        data: Dict[str, Any],
+    ) -> ToolCallCountEvaluator:
+        return TypeAdapter(ToolCallCountEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_tool_call_order_evaluator(
+        data: Dict[str, Any],
+    ) -> ToolCallOrderEvaluator:
+        return TypeAdapter(ToolCallOrderEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_tool_call_output_evaluator(
+        data: Dict[str, Any],
+    ) -> ToolCallOutputEvaluator:
+        return TypeAdapter(ToolCallOutputEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_llm_judge_simulation_trajectory_evaluator(
+        data: Dict[str, Any],
+    ) -> LLMJudgeTrajectorySimulationEvaluator:
+        return TypeAdapter(LLMJudgeTrajectorySimulationEvaluator).validate_python(
+            {
+                "id": data.get("id"),
+                "config": data.get("evaluatorConfig"),
+            }
+        )
+
+    @staticmethod
+    def _create_legacy_evaluator_internal(
+        data: Dict[str, Any],
+    ) -> LegacyBaseEvaluator[Any]:
         """Create an evaluator instance from configuration data.

         Args:
@@ -35,46 +328,38 @@ class EvaluatorFactory:
         Raises:
             ValueError: If category is unknown or required fields are missing
         """
-
-        name = data.get("name", "")
-        if not name:
-            raise ValueError("Evaluator configuration must include 'name' field")
-        id = data.get("id", "")
-        if not id:
-            raise ValueError("Evaluator configuration must include 'id' field")
-
-        params: EvaluatorBaseParams = TypeAdapter(Evaluator).validate_python(data)
+        params: EvaluatorBaseParams = TypeAdapter(LegacyEvaluator).validate_python(data)

         match params:
             case EqualsEvaluatorParams():
-                return EvaluatorFactory.
+                return EvaluatorFactory._create_legacy_exact_match_evaluator(params)
             case JsonSimilarityEvaluatorParams():
-                return EvaluatorFactory.
+                return EvaluatorFactory._create_legacy_json_similarity_evaluator(params)
             case LLMEvaluatorParams():
-                return EvaluatorFactory.
+                return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(params)
             case TrajectoryEvaluatorParams():
-                return EvaluatorFactory.
+                return EvaluatorFactory._create_legacy_trajectory_evaluator(params)
             case _:
                 raise ValueError(f"Unknown evaluator category: {params}")

     @staticmethod
-    def
+    def _create_legacy_exact_match_evaluator(
         params: EqualsEvaluatorParams,
-    ) ->
+    ) -> LegacyExactMatchEvaluator:
         """Create a deterministic evaluator."""
-        return
+        return LegacyExactMatchEvaluator(**params.model_dump())

     @staticmethod
-    def
+    def _create_legacy_json_similarity_evaluator(
         params: JsonSimilarityEvaluatorParams,
-    ) ->
+    ) -> LegacyJsonSimilarityEvaluator:
         """Create a deterministic evaluator."""
-        return
+        return LegacyJsonSimilarityEvaluator(**params.model_dump())

     @staticmethod
-    def
+    def _create_legacy_llm_as_judge_evaluator(
         params: LLMEvaluatorParams,
-    ) ->
+    ) -> LegacyLlmAsAJudgeEvaluator:
         """Create an LLM-as-a-judge evaluator."""
         if not params.prompt:
             raise ValueError("LLM evaluator must include 'prompt' field")
@@ -86,12 +371,12 @@ class EvaluatorFactory:
             "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
         )

-        return
+        return LegacyLlmAsAJudgeEvaluator(**params.model_dump())

     @staticmethod
-    def
+    def _create_legacy_trajectory_evaluator(
         params: TrajectoryEvaluatorParams,
-    ) ->
+    ) -> LegacyTrajectoryEvaluator:
         """Create a trajectory evaluator."""
         if not params.prompt:
             raise ValueError("Trajectory evaluator must include 'prompt' field")
@@ -103,4 +388,4 @@ class EvaluatorFactory:
             "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
         )

-        return
+        return LegacyTrajectoryEvaluator(**params.model_dump())
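
The factory now branches three ways: configs with version "1.0" go through _create_evaluator_internal, which either dynamically loads a custom evaluator class from a Python file (when evaluatorSchema carries the custom prefix) or validates the data against one of the built-in evaluator config types; everything else falls back to the legacy evaluators. The standalone sketch below reproduces only the dynamic-loading mechanic (spec_from_file_location plus exec_module) with the standard library; the file and class names are hypothetical and nothing here depends on the uipath package.

import importlib.util
import sys
import tempfile
from pathlib import Path

# A stand-in for a user-provided evaluator module (hypothetical content).
source = "class MyEvaluator:\n    def score(self):\n        return 1.0\n"

with tempfile.TemporaryDirectory() as tmp:
    file_path = Path(tmp) / "my_evaluator.py"
    file_path.write_text(source)

    # Same spec_from_file_location / exec_module sequence the factory uses.
    module_name = f"_custom_evaluator_{file_path.stem}"
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Could not load module from {file_path}")

    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)

    evaluator_class = getattr(module, "MyEvaluator")
    print(evaluator_class().score())  # 1.0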

uipath/_cli/_evals/_helpers.py
ADDED
@@ -0,0 +1,194 @@
+# type: ignore
+import ast
+import importlib.util
+import json
+import logging
+import re
+import sys
+from pathlib import Path
+from typing import Any, Optional
+
+import click
+
+from uipath._cli._utils._console import ConsoleLogger
+from uipath._utils.constants import CUSTOM_EVALUATOR_PREFIX
+
+logger = logging.getLogger(__name__)
+console = ConsoleLogger().get_instance()
+
+
+def try_extract_file_and_class_name(text: str) -> tuple[bool, str, str]:
+    if text.startswith(CUSTOM_EVALUATOR_PREFIX):
+        file_and_class = text[len(CUSTOM_EVALUATOR_PREFIX) :]
+        if ":" not in file_and_class:
+            raise ValueError(
+                f"evaluatorSchema must include class name after ':' - got: {text}"
+            )
+        file_path_str, class_name = file_and_class.rsplit(":", 1)
+
+        return True, file_path_str, class_name
+    return False, "", ""
+
+
+def to_kebab_case(text: str) -> str:
+    return re.sub(r"(?<!^)(?=[A-Z])", "-", text).lower()
+
+
+def find_evaluator_file(filename: str) -> Optional[Path]:
+    """Find the evaluator file in evals/evaluators/custom folder."""
+    custom_evaluators_path = Path.cwd() / "evals" / "evaluators" / "custom"
+
+    if not custom_evaluators_path.exists():
+        return None
+
+    file_path = custom_evaluators_path / filename
+    if file_path.exists():
+        return file_path
+
+    return None
+
+
+def find_base_evaluator_class(file_path: Path) -> Optional[str]:
+    """Parse the Python file and find the class that inherits from BaseEvaluator."""
+    try:
+        with open(file_path, "r") as f:
+            tree = ast.parse(f.read(), filename=str(file_path))
+
+        for node in ast.walk(tree):
+            if isinstance(node, ast.ClassDef):
+                for base in node.bases:
+                    if isinstance(base, ast.Name) and base.id == "BaseEvaluator":
+                        return node.name
+                    elif isinstance(base, ast.Subscript):
+                        if (
+                            isinstance(base.value, ast.Name)
+                            and base.value.id == "BaseEvaluator"
+                        ):
+                            return node.name
+
+        return None
+    except Exception as e:
+        logger.error(f"Error parsing file: {e}")
+        return None
+
+
+def load_evaluator_class(file_path: Path, class_name: str) -> Optional[type]:
+    """Dynamically load the evaluator class from the file."""
+    try:
+        parent_dir = str(file_path.parent)
+        if parent_dir not in sys.path:
+            sys.path.insert(0, parent_dir)
+
+        spec = importlib.util.spec_from_file_location("custom_evaluator", file_path)
+        if spec is None or spec.loader is None:
+            return None
+
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+
+        if hasattr(module, class_name):
+            return getattr(module, class_name)
+
+        return None
+    except Exception as e:
+        logger.error(f"Error loading class: {e}")
+        return None
+    finally:
+        # Remove from sys.path
+        if parent_dir in sys.path:
+            sys.path.remove(parent_dir)
+
+
+def generate_evaluator_config(evaluator_class: type, class_name: str) -> dict[str, Any]:
+    """Generate the evaluator config from the class."""
+    try:
+        config_type = evaluator_class._extract_config_type()
+        config_instance = config_type()
+        config_dict = config_instance.model_dump(by_alias=True, exclude_none=False)
+
+        return config_dict
+    except Exception as e:
+        console.error(f"Error inferring evaluator config: {e}")
+
+
+def register_evaluator(filename: str) -> tuple[str, str]:
+    """Infers the schema and types of a custom evaluator.
+
+    Returns:
+        tuple[str, str]:
+            - The first string is the path to the python evaluator file.
+            - The second string is the evaluator type that corresponds to the schema file.
+    """
+    if not filename.endswith(".py"):
+        filename = filename + ".py"
+    file_path = find_evaluator_file(filename)
+    if file_path is None:
+        console.error(f"Could not find '{filename}' in evals/evaluators/custom folder")
+
+    relative_path = f"evals/evaluators/custom/{filename}"
+    console.info(
+        f"Found custom evaluator file: {click.style(relative_path, fg='cyan')}"
+    )
+
+    class_name = find_base_evaluator_class(file_path)
+    if class_name is None:
+        console.error(
+            f"Could not find a class inheriting from BaseEvaluator in {filename}"
+        )
+
+    console.info(f"Found custom evaluator class: {click.style(class_name, fg='cyan')}")
+
+    evaluator_class = load_evaluator_class(file_path, class_name)
+    if evaluator_class is None:
+        console.error(f"Could not load class {class_name} from {filename}")
+
+    try:
+        evaluator_id = evaluator_class.get_evaluator_id()
+    except Exception as e:
+        console.error(f"Error getting evaluator ID: {e}")
+
+    evaluator_config = generate_evaluator_config(evaluator_class, class_name)
+    evaluator_json_type = evaluator_class.generate_json_type()
+
+    evaluators_dir = Path.cwd() / "evals" / "evaluators"
+    evaluators_dir.mkdir(parents=True, exist_ok=True)
+
+    evaluator_types_dir = evaluators_dir / "custom" / "types"
+    evaluator_types_dir.mkdir(parents=True, exist_ok=True)
+
+    kebab_class_name = to_kebab_case(class_name)
+    output_file_evaluator_types = kebab_class_name + "-types.json"
+    evaluator_types_output_path = (
+        evaluators_dir / "custom" / "types" / output_file_evaluator_types
+    )
+
+    with open(evaluator_types_output_path, "w") as f:
+        json.dump(evaluator_json_type, f, indent=2)
+
+    relative_output_path = (
+        f"evals/evaluators/custom/types/{output_file_evaluator_types}"
+    )
+    console.success(
+        f"Generated evaluator types: {click.style(relative_output_path, fg='cyan')}"
+    )
+
+    output = {
+        "version": "1.0",
+        "id": evaluator_id,
+        "evaluatorTypeId": f"{CUSTOM_EVALUATOR_PREFIX}types/{output_file_evaluator_types}",
+        "evaluatorSchema": f"{CUSTOM_EVALUATOR_PREFIX}{filename}:{class_name}",
+        "description": evaluator_class.__doc__,
+        "evaluatorConfig": evaluator_config,
+    }
+
+    output_file_evaluator_spec = kebab_class_name + ".json"
+    evaluator_spec_output_path = evaluators_dir / output_file_evaluator_spec
+    with open(evaluator_spec_output_path, "w") as f:
+        json.dump(output, f, indent=2)
+
+    relative_output_path = f"evals/evaluators/{output_file_evaluator_spec}"
+    console.success(
+        f"Generated evaluator spec: {click.style(relative_output_path, fg='cyan')}"
+    )
+
+    return str(file_path), str(evaluator_types_output_path)