uipath-2.1.107-py3-none-any.whl → uipath-2.1.109-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of uipath has been flagged as possibly problematic.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_runtime/_contracts.py +25 -5
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_resources/SDK_REFERENCE.md +0 -97
- uipath/_uipath.py +10 -37
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/_evals/_models/_evaluation_set.py

@@ -1,9 +1,11 @@
 from enum import Enum, IntEnum
 from typing import Annotated, Any, Dict, List, Literal, Optional, Union

-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
 from pydantic.alias_generators import to_camel

+from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
+

 class EvaluationSimulationTool(BaseModel):
     name: str = Field(..., alias="name")

@@ -103,6 +105,27 @@ MockingStrategy = Union[KnownMockingStrategy, UnknownMockingStrategy]
 class EvaluationItem(BaseModel):
     """Individual evaluation item within an evaluation set."""

+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    id: str
+    name: str
+    inputs: Dict[str, Any]
+    evaluation_criterias: dict[str, dict[str, Any] | None] = Field(
+        ..., alias="evaluationCriterias"
+    )
+    expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")
+    mocking_strategy: Optional[MockingStrategy] = Field(
+        default=None,
+        alias="mockingStrategy",
+    )
+    input_mocking_strategy: Optional[InputMockingStrategy] = Field(
+        default=None,
+        alias="inputMockingStrategy",
+    )
+
+
+class LegacyEvaluationItem(BaseModel):
+    """Individual evaluation item within an evaluation set."""
+
     model_config = ConfigDict(
         alias_generator=to_camel, populate_by_name=True, extra="allow"
     )

@@ -119,21 +142,41 @@ class EvaluationItem(BaseModel):
         default=None,
         alias="mockingStrategy",
     )
-    input_mocking_strategy: Optional[InputMockingStrategy] = Field(
-        default=None,
-        alias="inputMockingStrategy",
-    )


 class EvaluationSet(BaseModel):
     """Complete evaluation set model."""

+    model_config = ConfigDict(
+        alias_generator=to_camel, populate_by_name=True, extra="allow"
+    )
+
+    id: str
+    name: str
+    version: Literal["1.0"] = "1.0"
+    evaluator_refs: List[str] = Field(default_factory=list)
+    evaluations: List[EvaluationItem] = Field(default_factory=list)
+
+    def extract_selected_evals(self, eval_ids) -> None:
+        selected_evals: list[EvaluationItem] = []
+        for evaluation in self.evaluations:
+            if evaluation.id in eval_ids:
+                selected_evals.append(evaluation)
+                eval_ids.remove(evaluation.id)
+        if len(eval_ids) > 0:
+            raise ValueError("Unknown evaluation ids: {}".format(eval_ids))
+        self.evaluations = selected_evals
+
+
+class LegacyEvaluationSet(BaseModel):
+    """Complete evaluation set model."""
+
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

     id: str
     file_name: str = Field(..., alias="fileName")
     evaluator_refs: List[str] = Field(default_factory=list)
-    evaluations: List[
+    evaluations: List[LegacyEvaluationItem] = Field(default_factory=list)
     name: str
     batch_size: int = Field(10, alias="batchSize")
     timeout_minutes: int = Field(default=20, alias="timeoutMinutes")

@@ -144,7 +187,7 @@ class EvaluationSet(BaseModel):
     updated_at: str = Field(alias="updatedAt")

     def extract_selected_evals(self, eval_ids) -> None:
-        selected_evals: list[
+        selected_evals: list[LegacyEvaluationItem] = []
         for evaluation in self.evaluations:
             if evaluation.id in eval_ids:
                 selected_evals.append(evaluation)

@@ -158,3 +201,26 @@ class EvaluationStatus(IntEnum):
     PENDING = 0
     IN_PROGRESS = 1
     COMPLETED = 2
+
+
+def _discriminate_eval_set(
+    v: Any,
+) -> Literal["evaluation_set", "legacy_evaluation_set"]:
+    """Discriminator function that returns a tag based on version field."""
+    if isinstance(v, dict):
+        version = v.get("version")
+        if version == "1.0":
+            return "evaluation_set"
+    return "legacy_evaluation_set"
+
+
+AnyEvaluationSet = Annotated[
+    Union[
+        Annotated[EvaluationSet, Tag("evaluation_set")],
+        Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
+    ],
+    Discriminator(_discriminate_eval_set),
+]
+
+AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]
+AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]]
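The new `version: Literal["1.0"]` field and the `_discriminate_eval_set` tag function let one entry point accept both the new and the legacy evaluation-set JSON. A minimal sketch of how the union could be exercised with pydantic's `TypeAdapter`; the payload values are illustrative only, not taken from a real evaluation set:

    from pydantic import TypeAdapter
    from uipath._cli._evals._models._evaluation_set import AnyEvaluationSet, EvaluationSet

    adapter = TypeAdapter(AnyEvaluationSet)

    # A payload carrying version "1.0" is routed to the new EvaluationSet model.
    new_set = adapter.validate_python({"id": "set-1", "name": "demo", "version": "1.0"})
    assert isinstance(new_set, EvaluationSet)

    # Any other payload is tagged "legacy_evaluation_set" and validated as LegacyEvaluationSet.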
uipath/_cli/_evals/_models/_evaluator.py

@@ -2,7 +2,37 @@ from typing import Annotated, Any, Literal, Union

 from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag

-from uipath.eval.
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorConfig
+from uipath.eval.evaluators.contains_evaluator import ContainsEvaluatorConfig
+from uipath.eval.evaluators.exact_match_evaluator import ExactMatchEvaluatorConfig
+from uipath.eval.evaluators.json_similarity_evaluator import (
+    JsonSimilarityEvaluatorConfig,
+)
+from uipath.eval.evaluators.llm_judge_output_evaluator import (
+    LLMJudgeOutputEvaluatorConfig,
+    LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
+)
+from uipath.eval.evaluators.llm_judge_trajectory_evaluator import (
+    LLMJudgeTrajectoryEvaluatorConfig,
+    LLMJudgeTrajectorySimulationEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_args_evaluator import (
+    ToolCallArgsEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_count_evaluator import (
+    ToolCallCountEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_order_evaluator import (
+    ToolCallOrderEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_output_evaluator import (
+    ToolCallOutputEvaluatorConfig,
+)
+from uipath.eval.models import (
+    EvaluatorType,
+    LegacyEvaluatorCategory,
+    LegacyEvaluatorType,
+)


 class EvaluatorBaseParams(BaseModel):

@@ -11,7 +41,7 @@ class EvaluatorBaseParams(BaseModel):
     id: str
     name: str
     description: str
-    evaluator_type:
+    evaluator_type: LegacyEvaluatorType = Field(..., alias="type")
     created_at: str = Field(..., alias="createdAt")
     updated_at: str = Field(..., alias="updatedAt")
     target_output_key: str = Field(..., alias="targetOutputKey")

@@ -19,7 +49,9 @@ class EvaluatorBaseParams(BaseModel):


 class LLMEvaluatorParams(EvaluatorBaseParams):
-    category: Literal[
+    category: Literal[LegacyEvaluatorCategory.LlmAsAJudge] = Field(
+        ..., alias="category"
+    )
     prompt: str = Field(..., alias="prompt")
     model: str = Field(..., alias="model")

@@ -29,7 +61,7 @@ class LLMEvaluatorParams(EvaluatorBaseParams):


 class TrajectoryEvaluatorParams(EvaluatorBaseParams):
-    category: Literal[
+    category: Literal[LegacyEvaluatorCategory.Trajectory] = Field(..., alias="category")
     prompt: str = Field(..., alias="prompt")
     model: str = Field(..., alias="model")

@@ -61,15 +93,15 @@ def evaluator_discriminator(data: Any) -> str:
         category = data.get("category")
         evaluator_type = data.get("type")
         match category:
-            case
+            case LegacyEvaluatorCategory.LlmAsAJudge:
                 return "LLMEvaluatorParams"
-            case
+            case LegacyEvaluatorCategory.Trajectory:
                 return "TrajectoryEvaluatorParams"
-            case
+            case LegacyEvaluatorCategory.Deterministic:
                 match evaluator_type:
-                    case
+                    case LegacyEvaluatorType.Equals:
                         return "EqualsEvaluatorParams"
-                    case
+                    case LegacyEvaluatorType.JsonSimilarity:
                         return "JsonSimilarityEvaluatorParams"
                     case _:
                         return "UnknownEvaluatorParams"

@@ -104,3 +136,145 @@ Evaluator = Annotated[
     ],
     Field(discriminator=Discriminator(evaluator_discriminator)),
 ]
+
+
+class UnknownEvaluatorConfig(BaseEvaluatorConfig[Any]):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+def legacy_evaluator_discriminator(data: Any) -> str:
+    if isinstance(data, dict):
+        category = data.get("category")
+        evaluator_type = data.get("type")
+        match category:
+            case LegacyEvaluatorCategory.LlmAsAJudge:
+                return "LLMEvaluatorParams"
+            case LegacyEvaluatorCategory.Trajectory:
+                return "TrajectoryEvaluatorParams"
+            case LegacyEvaluatorCategory.Deterministic:
+                match evaluator_type:
+                    case LegacyEvaluatorType.Equals:
+                        return "EqualsEvaluatorParams"
+                    case LegacyEvaluatorType.JsonSimilarity:
+                        return "JsonSimilarityEvaluatorParams"
+                    case _:
+                        return "UnknownEvaluatorParams"
+            case _:
+                return "UnknownEvaluatorParams"
+    else:
+        return "UnknownEvaluatorParams"
+
+
+def evaluator_config_discriminator(data: Any) -> str:
+    if isinstance(data, dict):
+        evaluator_type_id = data.get("evaluatorTypeId")
+        match evaluator_type_id:
+            case EvaluatorType.CONTAINS:
+                return "ContainsEvaluatorConfig"
+            case EvaluatorType.EXACT_MATCH:
+                return "ExactMatchEvaluatorConfig"
+            case EvaluatorType.JSON_SIMILARITY:
+                return "JsonSimilarityEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_OUTPUT_SEMANTIC_SIMILARITY:
+                return "LLMJudgeOutputEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY:
+                return "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY:
+                return "LLMJudgeTrajectoryEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION:
+                return "LLMJudgeTrajectorySimulationEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_ARGS:
+                return "ToolCallArgsEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_COUNT:
+                return "ToolCallCountEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_ORDER:
+                return "ToolCallOrderEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_OUTPUT:
+                return "ToolCallOutputEvaluatorConfig"
+            case _:
+                return "UnknownEvaluatorConfig"
+    else:
+        return "UnknownEvaluatorConfig"
+
+
+LegacyEvaluator = Annotated[
+    Union[
+        Annotated[
+            LLMEvaluatorParams,
+            Tag("LLMEvaluatorParams"),
+        ],
+        Annotated[
+            TrajectoryEvaluatorParams,
+            Tag("TrajectoryEvaluatorParams"),
+        ],
+        Annotated[
+            EqualsEvaluatorParams,
+            Tag("EqualsEvaluatorParams"),
+        ],
+        Annotated[
+            JsonSimilarityEvaluatorParams,
+            Tag("JsonSimilarityEvaluatorParams"),
+        ],
+        Annotated[
+            UnknownEvaluatorParams,
+            Tag("UnknownEvaluatorParams"),
+        ],
+    ],
+    Field(discriminator=Discriminator(legacy_evaluator_discriminator)),
+]
+
+EvaluatorConfig = Annotated[
+    Union[
+        Annotated[
+            ContainsEvaluatorConfig,
+            Tag("ContainsEvaluatorConfig"),
+        ],
+        Annotated[
+            ExactMatchEvaluatorConfig,
+            Tag("ExactMatchEvaluatorConfig"),
+        ],
+        Annotated[
+            JsonSimilarityEvaluatorConfig,
+            Tag("JsonSimilarityEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeOutputEvaluatorConfig,
+            Tag("LLMJudgeOutputEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
+            Tag("LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeTrajectoryEvaluatorConfig,
+            Tag("LLMJudgeTrajectoryEvaluatorConfig"),
+        ],
+        Annotated[
+            ToolCallArgsEvaluatorConfig,
+            Tag("ToolCallArgsEvaluatorConfig"),
+        ],
+        Annotated[
+            ToolCallCountEvaluatorConfig,
+            Tag("ToolCallCountEvaluatorConfig"),
+        ],
+        Annotated[
+            ToolCallOrderEvaluatorConfig,
+            Tag("ToolCallOrderEvaluatorConfig"),
+        ],
+        Annotated[
+            ToolCallOutputEvaluatorConfig,
+            Tag("ToolCallOutputEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeTrajectorySimulationEvaluatorConfig,
+            Tag("LLMJudgeTrajectorySimulationEvaluatorConfig"),
+        ],
+        Annotated[
+            UnknownEvaluatorConfig,
+            Tag("UnknownEvaluatorConfig"),
+        ],
+    ],
+    Field(discriminator=Discriminator(evaluator_config_discriminator)),
+]
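The config-side discriminator mirrors the legacy one but keys off `evaluatorTypeId` instead of `category`/`type`. A small illustration of that routing, using the enum members referenced above (the concrete wire values depend on how `EvaluatorType` is defined in `uipath/eval/models`):

    from uipath.eval.models import EvaluatorType
    from uipath._cli._evals._models._evaluator import evaluator_config_discriminator

    # A recognised evaluatorTypeId selects the corresponding config model tag.
    assert (
        evaluator_config_discriminator({"evaluatorTypeId": EvaluatorType.TOOL_CALL_ORDER})
        == "ToolCallOrderEvaluatorConfig"
    )

    # Unrecognised ids and non-dict payloads fall back to UnknownEvaluatorConfig,
    # whose extra="allow" config keeps the raw fields around.
    assert evaluator_config_discriminator(None) == "UnknownEvaluatorConfig"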
uipath/_cli/_evals/_models/_evaluator_base_params.py

@@ -1,14 +1,14 @@
 from pydantic import BaseModel

-from uipath.eval.models.models import
+from uipath.eval.models.models import LegacyEvaluatorCategory, LegacyEvaluatorType


 class EvaluatorBaseParams(BaseModel):
     """Parameters for initializing the base evaluator."""

     id: str
-    category:
-    evaluator_type:
+    category: LegacyEvaluatorCategory
+    evaluator_type: LegacyEvaluatorType
     name: str
     description: str
     created_at: str
uipath/_cli/_evals/_models/_output.py

@@ -1,9 +1,11 @@
 import logging
-from
+from collections import defaultdict
+from typing import Any, Dict, List, Optional

 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel, ConfigDict, model_serializer
 from pydantic.alias_generators import to_camel
+from pydantic_core import core_schema

 from uipath._cli._runtime._contracts import UiPathRuntimeResult
 from uipath.eval.models.models import EvaluationResult, ScoreType
@@ -24,11 +26,15 @@ class EvaluationResultDto(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

     score: float
-    details: Optional[str] = None
+    details: Optional[str | BaseModel] = None
     evaluation_time: Optional[float] = None

     @model_serializer(mode="wrap")
-    def serialize_model(
+    def serialize_model(
+        self,
+        serializer: core_schema.SerializerFunctionWrapHandler,
+        info: core_schema.SerializationInfo,
+    ) -> Any:
         data = serializer(self)
         if self.details is None and isinstance(data, dict):
             data.pop("details", None)
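The widened `details: Optional[str | BaseModel]` plus the explicit wrap-serializer signature keep the DTO's JSON shape stable: when `details` is `None` the key is dropped entirely rather than serialized as null. A minimal sketch of the resulting behaviour, assuming `score` is the only required field and the serializer returns the dict it builds, as in the surrounding code:

    from uipath._cli._evals._models._output import EvaluationResultDto

    dto = EvaluationResultDto(score=0.87)
    # The wrap serializer removes the key instead of emitting "details": null.
    assert "details" not in dto.model_dump(by_alias=True)

    dto = EvaluationResultDto(score=0.87, details="matched 3 of 3 criteria")
    assert dto.model_dump(by_alias=True)["details"] == "matched 3 of 3 criteria"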
@@ -101,3 +107,81 @@ class UiPathEvalOutput(BaseModel):
             eval_result.score for eval_result in self.evaluation_set_results
         ]
         return sum(eval_item_scores) / len(eval_item_scores)
+
+    def calculate_final_score(
+        self,
+        evaluator_weights: Dict[str, float] | None = None,
+        default_weight: float = 1.0,
+    ) -> tuple[float, Dict[str, float]]:
+        """Aggregate evaluation results with deduplication and weighted scoring.
+
+        This function performs the following steps:
+        1. Flattens the nested evaluation_set_results structure
+        2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates)
+        3. Calculates average score per evaluator across all datapoints
+        4. Computes final weighted score across evaluators
+
+        Args:
+            evaluator_weights: Optional dict mapping evaluator names to weights
+            default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0)
+
+        Returns:
+            Tuple of (final_score, agg_metrics_per_evaluator)
+            - final_score: Weighted average across evaluators
+            - agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
+        """
+        if not self.evaluation_set_results:
+            return 0.0, {}
+
+        if evaluator_weights is None:
+            evaluator_weights = {}
+
+        # Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication
+        # datapoint_id = evaluation_name, evaluator_name from EvaluationRunResultDto
+        grouped_by_datapoint_evaluator: defaultdict[
+            str, defaultdict[str, list[float]]
+        ] = defaultdict(lambda: defaultdict(list))
+
+        for eval_run_result in self.evaluation_set_results:
+            datapoint_id = eval_run_result.evaluation_name
+            for eval_run_result_dto in eval_run_result.evaluation_run_results:
+                evaluator_name = eval_run_result_dto.evaluator_name
+                score = eval_run_result_dto.result.score
+                grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
+                    score
+                )
+
+        # Step 2: Deduplicate by averaging same evaluator results for same datapoint
+        dedup_scores: list[tuple[str, str, float]] = []
+        for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
+            for evaluator_name, scores_list in evaluators_dict.items():
+                if scores_list:
+                    # Average the scores for this evaluator on this datapoint
+                    avg_score = sum(scores_list) / len(scores_list)
+                    dedup_scores.append((datapoint_id, evaluator_name, avg_score))
+
+        # Step 3: Group by evaluator and calculate average score per evaluator
+        grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
+        for _datapoint_id, evaluator_name, score in dedup_scores:
+            grouped_by_evaluator[evaluator_name].append(score)
+
+        agg_metrics_per_evaluator = {}
+        for evaluator_name, scores_list in grouped_by_evaluator.items():
+            avg_score = sum(scores_list) / len(scores_list)
+            agg_metrics_per_evaluator[evaluator_name] = avg_score
+
+        # Step 4: Calculate final weighted score
+        if not agg_metrics_per_evaluator:
+            return 0.0, {}
+
+        total_weighted_score = 0.0
+        total_weight = 0.0
+
+        for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
+            weight = evaluator_weights.get(evaluator_name, default_weight)
+            total_weighted_score += avg_score * weight
+            total_weight += weight
+
+        final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
+
+        return final_score, agg_metrics_per_evaluator
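To make the weighting concrete, here is a small standalone sketch of the arithmetic that `calculate_final_score` performs in steps 3 and 4 (per-evaluator averaging followed by a weighted mean); the evaluator names, scores, and weights are invented for illustration:

    # Hand-written per-datapoint scores, already deduplicated as in step 2.
    scores_by_evaluator = {
        "ExactMatch": [1.0, 0.0, 1.0],   # average 0.6667
        "LLMJudgeOutput": [0.8, 0.9],    # average 0.85
    }
    weights = {"ExactMatch": 1.0, "LLMJudgeOutput": 2.0}
    default_weight = 1.0

    # Step 3: average score per evaluator across datapoints.
    per_evaluator = {
        name: sum(vals) / len(vals) for name, vals in scores_by_evaluator.items()
    }

    # Step 4: weighted mean across evaluators.
    total_weight = sum(weights.get(name, default_weight) for name in per_evaluator)
    final_score = sum(
        avg * weights.get(name, default_weight) for name, avg in per_evaluator.items()
    ) / total_weight

    print(per_evaluator)  # {'ExactMatch': 0.666..., 'LLMJudgeOutput': 0.85}
    print(final_score)    # (0.6667 * 1.0 + 0.85 * 2.0) / 3.0 ≈ 0.789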