uipath 2.1.108__py3-none-any.whl → 2.1.110__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (72)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_runtime/_contracts.py +2 -4
  19. uipath/_cli/_runtime/_runtime.py +2 -5
  20. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  21. uipath/_cli/_utils/_eval_set.py +30 -9
  22. uipath/_cli/_utils/_resources.py +21 -0
  23. uipath/_cli/_utils/_studio_project.py +18 -0
  24. uipath/_cli/cli_add.py +114 -0
  25. uipath/_cli/cli_eval.py +5 -1
  26. uipath/_cli/cli_init.py +5 -4
  27. uipath/_cli/cli_pull.py +11 -26
  28. uipath/_cli/cli_push.py +2 -0
  29. uipath/_cli/cli_register.py +45 -0
  30. uipath/_events/_events.py +6 -5
  31. uipath/_utils/constants.py +4 -0
  32. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  33. uipath/eval/_helpers/helpers.py +30 -2
  34. uipath/eval/evaluators/__init__.py +60 -5
  35. uipath/eval/evaluators/base_evaluator.py +546 -44
  36. uipath/eval/evaluators/contains_evaluator.py +80 -0
  37. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  38. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  39. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  40. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  41. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  42. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  43. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  44. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  45. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  46. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  47. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  48. uipath/eval/evaluators/output_evaluator.py +117 -0
  49. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  50. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  51. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  52. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  53. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  54. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  55. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  56. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  57. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  58. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  59. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  60. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  61. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  62. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  63. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  64. uipath/eval/evaluators_types/generate_types.py +31 -0
  65. uipath/eval/models/__init__.py +16 -1
  66. uipath/eval/models/llm_judge_types.py +196 -0
  67. uipath/eval/models/models.py +109 -7
  68. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/METADATA +1 -1
  69. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/RECORD +72 -40
  70. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/WHEEL +0 -0
  71. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/entry_points.txt +0 -0
  72. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,11 @@
 from enum import Enum, IntEnum
 from typing import Annotated, Any, Dict, List, Literal, Optional, Union
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
 from pydantic.alias_generators import to_camel
 
+from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
+
 
 class EvaluationSimulationTool(BaseModel):
     name: str = Field(..., alias="name")
@@ -103,6 +105,27 @@ MockingStrategy = Union[KnownMockingStrategy, UnknownMockingStrategy]
 class EvaluationItem(BaseModel):
     """Individual evaluation item within an evaluation set."""
 
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    id: str
+    name: str
+    inputs: Dict[str, Any]
+    evaluation_criterias: dict[str, dict[str, Any] | None] = Field(
+        ..., alias="evaluationCriterias"
+    )
+    expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")
+    mocking_strategy: Optional[MockingStrategy] = Field(
+        default=None,
+        alias="mockingStrategy",
+    )
+    input_mocking_strategy: Optional[InputMockingStrategy] = Field(
+        default=None,
+        alias="inputMockingStrategy",
+    )
+
+
+class LegacyEvaluationItem(BaseModel):
+    """Individual evaluation item within an evaluation set."""
+
     model_config = ConfigDict(
         alias_generator=to_camel, populate_by_name=True, extra="allow"
     )
@@ -119,21 +142,41 @@ class EvaluationItem(BaseModel):
         default=None,
         alias="mockingStrategy",
     )
-    input_mocking_strategy: Optional[InputMockingStrategy] = Field(
-        default=None,
-        alias="inputMockingStrategy",
-    )
 
 
 class EvaluationSet(BaseModel):
     """Complete evaluation set model."""
 
+    model_config = ConfigDict(
+        alias_generator=to_camel, populate_by_name=True, extra="allow"
+    )
+
+    id: str
+    name: str
+    version: Literal["1.0"] = "1.0"
+    evaluator_refs: List[str] = Field(default_factory=list)
+    evaluations: List[EvaluationItem] = Field(default_factory=list)
+
+    def extract_selected_evals(self, eval_ids) -> None:
+        selected_evals: list[EvaluationItem] = []
+        for evaluation in self.evaluations:
+            if evaluation.id in eval_ids:
+                selected_evals.append(evaluation)
+                eval_ids.remove(evaluation.id)
+        if len(eval_ids) > 0:
+            raise ValueError("Unknown evaluation ids: {}".format(eval_ids))
+        self.evaluations = selected_evals
+
+
+class LegacyEvaluationSet(BaseModel):
+    """Complete evaluation set model."""
+
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
     id: str
     file_name: str = Field(..., alias="fileName")
     evaluator_refs: List[str] = Field(default_factory=list)
-    evaluations: List[EvaluationItem] = Field(default_factory=list)
+    evaluations: List[LegacyEvaluationItem] = Field(default_factory=list)
     name: str
     batch_size: int = Field(10, alias="batchSize")
     timeout_minutes: int = Field(default=20, alias="timeoutMinutes")
@@ -144,7 +187,7 @@ class EvaluationSet(BaseModel):
     updated_at: str = Field(alias="updatedAt")
 
     def extract_selected_evals(self, eval_ids) -> None:
-        selected_evals: list[EvaluationItem] = []
+        selected_evals: list[LegacyEvaluationItem] = []
         for evaluation in self.evaluations:
             if evaluation.id in eval_ids:
                 selected_evals.append(evaluation)
@@ -158,3 +201,26 @@ class EvaluationStatus(IntEnum):
     PENDING = 0
     IN_PROGRESS = 1
     COMPLETED = 2
+
+
+def _discriminate_eval_set(
+    v: Any,
+) -> Literal["evaluation_set", "legacy_evaluation_set"]:
+    """Discriminator function that returns a tag based on version field."""
+    if isinstance(v, dict):
+        version = v.get("version")
+        if version == "1.0":
+            return "evaluation_set"
+    return "legacy_evaluation_set"
+
+
+AnyEvaluationSet = Annotated[
+    Union[
+        Annotated[EvaluationSet, Tag("evaluation_set")],
+        Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
+    ],
+    Discriminator(_discriminate_eval_set),
+]
+
+AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]
+AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]]
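
A minimal usage sketch for the version-discriminated union added above (not shipped in the package), assuming pydantic v2's TypeAdapter and that these hunks belong to uipath/_cli/_evals/_models/_evaluation_set.py as listed in the file table:

from pydantic import TypeAdapter

from uipath._cli._evals._models._evaluation_set import AnyEvaluationSet, EvaluationSet

adapter = TypeAdapter(AnyEvaluationSet)

# "version": "1.0" routes to the new EvaluationSet; any other value (or a missing
# version field) falls back to LegacyEvaluationSet via _discriminate_eval_set.
payload = {
    "id": "set-1",
    "name": "smoke tests",
    "version": "1.0",
    "evaluatorRefs": ["exact-match"],
    "evaluations": [],
}
eval_set = adapter.validate_python(payload)
assert isinstance(eval_set, EvaluationSet)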
@@ -2,7 +2,37 @@ from typing import Annotated, Any, Literal, Union
 
 from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
 
-from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorConfig
+from uipath.eval.evaluators.contains_evaluator import ContainsEvaluatorConfig
+from uipath.eval.evaluators.exact_match_evaluator import ExactMatchEvaluatorConfig
+from uipath.eval.evaluators.json_similarity_evaluator import (
+    JsonSimilarityEvaluatorConfig,
+)
+from uipath.eval.evaluators.llm_judge_output_evaluator import (
+    LLMJudgeOutputEvaluatorConfig,
+    LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
+)
+from uipath.eval.evaluators.llm_judge_trajectory_evaluator import (
+    LLMJudgeTrajectoryEvaluatorConfig,
+    LLMJudgeTrajectorySimulationEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_args_evaluator import (
+    ToolCallArgsEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_count_evaluator import (
+    ToolCallCountEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_order_evaluator import (
+    ToolCallOrderEvaluatorConfig,
+)
+from uipath.eval.evaluators.tool_call_output_evaluator import (
+    ToolCallOutputEvaluatorConfig,
+)
+from uipath.eval.models import (
+    EvaluatorType,
+    LegacyEvaluatorCategory,
+    LegacyEvaluatorType,
+)
 
 
 class EvaluatorBaseParams(BaseModel):
@@ -11,7 +41,7 @@ class EvaluatorBaseParams(BaseModel):
     id: str
     name: str
     description: str
-    evaluator_type: EvaluatorType = Field(..., alias="type")
+    evaluator_type: LegacyEvaluatorType = Field(..., alias="type")
     created_at: str = Field(..., alias="createdAt")
     updated_at: str = Field(..., alias="updatedAt")
     target_output_key: str = Field(..., alias="targetOutputKey")
@@ -19,7 +49,9 @@
 
 
 class LLMEvaluatorParams(EvaluatorBaseParams):
-    category: Literal[EvaluatorCategory.LlmAsAJudge] = Field(..., alias="category")
+    category: Literal[LegacyEvaluatorCategory.LlmAsAJudge] = Field(
+        ..., alias="category"
+    )
     prompt: str = Field(..., alias="prompt")
     model: str = Field(..., alias="model")
 
@@ -29,7 +61,7 @@ class LLMEvaluatorParams(EvaluatorBaseParams):
 
 
 class TrajectoryEvaluatorParams(EvaluatorBaseParams):
-    category: Literal[EvaluatorCategory.Trajectory] = Field(..., alias="category")
+    category: Literal[LegacyEvaluatorCategory.Trajectory] = Field(..., alias="category")
     prompt: str = Field(..., alias="prompt")
     model: str = Field(..., alias="model")
 
@@ -61,15 +93,15 @@ def evaluator_discriminator(data: Any) -> str:
         category = data.get("category")
         evaluator_type = data.get("type")
         match category:
-            case EvaluatorCategory.LlmAsAJudge:
+            case LegacyEvaluatorCategory.LlmAsAJudge:
                 return "LLMEvaluatorParams"
-            case EvaluatorCategory.Trajectory:
+            case LegacyEvaluatorCategory.Trajectory:
                 return "TrajectoryEvaluatorParams"
-            case EvaluatorCategory.Deterministic:
+            case LegacyEvaluatorCategory.Deterministic:
                 match evaluator_type:
-                    case EvaluatorType.Equals:
+                    case LegacyEvaluatorType.Equals:
                         return "EqualsEvaluatorParams"
-                    case EvaluatorType.JsonSimilarity:
+                    case LegacyEvaluatorType.JsonSimilarity:
                         return "JsonSimilarityEvaluatorParams"
                     case _:
                         return "UnknownEvaluatorParams"
@@ -104,3 +136,145 @@ Evaluator = Annotated[
     ],
     Field(discriminator=Discriminator(evaluator_discriminator)),
 ]
+
+
+class UnknownEvaluatorConfig(BaseEvaluatorConfig[Any]):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+def legacy_evaluator_discriminator(data: Any) -> str:
+    if isinstance(data, dict):
+        category = data.get("category")
+        evaluator_type = data.get("type")
+        match category:
+            case LegacyEvaluatorCategory.LlmAsAJudge:
+                return "LLMEvaluatorParams"
+            case LegacyEvaluatorCategory.Trajectory:
+                return "TrajectoryEvaluatorParams"
+            case LegacyEvaluatorCategory.Deterministic:
+                match evaluator_type:
+                    case LegacyEvaluatorType.Equals:
+                        return "EqualsEvaluatorParams"
+                    case LegacyEvaluatorType.JsonSimilarity:
+                        return "JsonSimilarityEvaluatorParams"
+                    case _:
+                        return "UnknownEvaluatorParams"
+            case _:
+                return "UnknownEvaluatorParams"
+    else:
+        return "UnknownEvaluatorParams"
+
+
+def evaluator_config_discriminator(data: Any) -> str:
+    if isinstance(data, dict):
+        evaluator_type_id = data.get("evaluatorTypeId")
+        match evaluator_type_id:
+            case EvaluatorType.CONTAINS:
+                return "ContainsEvaluatorConfig"
+            case EvaluatorType.EXACT_MATCH:
+                return "ExactMatchEvaluatorConfig"
+            case EvaluatorType.JSON_SIMILARITY:
+                return "JsonSimilarityEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_OUTPUT_SEMANTIC_SIMILARITY:
+                return "LLMJudgeOutputEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY:
+                return "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY:
+                return "LLMJudgeTrajectoryEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION:
+                return "LLMJudgeTrajectorySimulationEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_ARGS:
+                return "ToolCallArgsEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_COUNT:
+                return "ToolCallCountEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_ORDER:
+                return "ToolCallOrderEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_OUTPUT:
+                return "ToolCallOutputEvaluatorConfig"
+            case _:
+                return "UnknownEvaluatorConfig"
+    else:
+        return "UnknownEvaluatorConfig"
+
+
+LegacyEvaluator = Annotated[
+    Union[
+        Annotated[
+            LLMEvaluatorParams,
+            Tag("LLMEvaluatorParams"),
+        ],
+        Annotated[
+            TrajectoryEvaluatorParams,
+            Tag("TrajectoryEvaluatorParams"),
+        ],
+        Annotated[
+            EqualsEvaluatorParams,
+            Tag("EqualsEvaluatorParams"),
+        ],
+        Annotated[
+            JsonSimilarityEvaluatorParams,
+            Tag("JsonSimilarityEvaluatorParams"),
+        ],
+        Annotated[
+            UnknownEvaluatorParams,
+            Tag("UnknownEvaluatorParams"),
+        ],
+    ],
+    Field(discriminator=Discriminator(legacy_evaluator_discriminator)),
+]
+
+EvaluatorConfig = Annotated[
+    Union[
+        Annotated[
+            ContainsEvaluatorConfig,
+            Tag("ContainsEvaluatorConfig"),
+        ],
+        Annotated[
+            ExactMatchEvaluatorConfig,
+            Tag("ExactMatchEvaluatorConfig"),
+        ],
+        Annotated[
+            JsonSimilarityEvaluatorConfig,
+            Tag("JsonSimilarityEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeOutputEvaluatorConfig,
+            Tag("LLMJudgeOutputEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
+            Tag("LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeTrajectoryEvaluatorConfig,
+            Tag("LLMJudgeTrajectoryEvaluatorConfig"),
+        ],
+        Annotated[
+            ToolCallArgsEvaluatorConfig,
+            Tag("ToolCallArgsEvaluatorConfig"),
+        ],
+        Annotated[
+            ToolCallCountEvaluatorConfig,
+            Tag("ToolCallCountEvaluatorConfig"),
+        ],
+        Annotated[
+            ToolCallOrderEvaluatorConfig,
+            Tag("ToolCallOrderEvaluatorConfig"),
+        ],
+        Annotated[
+            ToolCallOutputEvaluatorConfig,
+            Tag("ToolCallOutputEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeTrajectorySimulationEvaluatorConfig,
+            Tag("LLMJudgeTrajectorySimulationEvaluatorConfig"),
+        ],
+        Annotated[
+            UnknownEvaluatorConfig,
+            Tag("UnknownEvaluatorConfig"),
+        ],
+    ],
+    Field(discriminator=Discriminator(evaluator_config_discriminator)),
+]
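
A hypothetical routing sketch for the new evaluator_config_discriminator above, assuming these hunks belong to uipath/_cli/_evals/_models/_evaluator.py (per the file list) and that serialized evaluator definitions carry an evaluatorTypeId that compares equal to an EvaluatorType member:

from uipath._cli._evals._models._evaluator import evaluator_config_discriminator
from uipath.eval.models import EvaluatorType

# A known evaluatorTypeId is mapped to the tag of its config class...
raw_config = {"evaluatorTypeId": EvaluatorType.TOOL_CALL_ORDER, "name": "tool-order"}
assert evaluator_config_discriminator(raw_config) == "ToolCallOrderEvaluatorConfig"

# ...while unknown ids (and non-dict inputs) fall back to the permissive UnknownEvaluatorConfig.
assert (
    evaluator_config_discriminator({"evaluatorTypeId": "something-else"})
    == "UnknownEvaluatorConfig"
)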
@@ -1,14 +1,14 @@
 from pydantic import BaseModel
 
-from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
+from uipath.eval.models.models import LegacyEvaluatorCategory, LegacyEvaluatorType
 
 
 class EvaluatorBaseParams(BaseModel):
     """Parameters for initializing the base evaluator."""
 
     id: str
-    category: EvaluatorCategory
-    evaluator_type: EvaluatorType
+    category: LegacyEvaluatorCategory
+    evaluator_type: LegacyEvaluatorType
     name: str
     description: str
     created_at: str
@@ -1,9 +1,11 @@
 import logging
-from typing import List, Optional
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
 
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel, ConfigDict, model_serializer
 from pydantic.alias_generators import to_camel
+from pydantic_core import core_schema
 
 from uipath._cli._runtime._contracts import UiPathRuntimeResult
 from uipath.eval.models.models import EvaluationResult, ScoreType
@@ -24,11 +26,15 @@ class EvaluationResultDto(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
     score: float
-    details: Optional[str] = None
+    details: Optional[str | BaseModel] = None
     evaluation_time: Optional[float] = None
 
     @model_serializer(mode="wrap")
-    def serialize_model(self, serializer, info):
+    def serialize_model(
+        self,
+        serializer: core_schema.SerializerFunctionWrapHandler,
+        info: core_schema.SerializationInfo,
+    ) -> Any:
         data = serializer(self)
         if self.details is None and isinstance(data, dict):
             data.pop("details", None)
@@ -101,3 +107,81 @@ class UiPathEvalOutput(BaseModel):
             eval_result.score for eval_result in self.evaluation_set_results
         ]
         return sum(eval_item_scores) / len(eval_item_scores)
+
+    def calculate_final_score(
+        self,
+        evaluator_weights: Dict[str, float] | None = None,
+        default_weight: float = 1.0,
+    ) -> tuple[float, Dict[str, float]]:
+        """Aggregate evaluation results with deduplication and weighted scoring.
+
+        This function performs the following steps:
+        1. Flattens the nested evaluation_set_results structure
+        2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates)
+        3. Calculates average score per evaluator across all datapoints
+        4. Computes final weighted score across evaluators
+
+        Args:
+            evaluator_weights: Optional dict mapping evaluator names to weights
+            default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0)
+
+        Returns:
+            Tuple of (final_score, agg_metrics_per_evaluator)
+            - final_score: Weighted average across evaluators
+            - agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
+        """
+        if not self.evaluation_set_results:
+            return 0.0, {}
+
+        if evaluator_weights is None:
+            evaluator_weights = {}
+
+        # Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication
+        # datapoint_id = evaluation_name, evaluator_name from EvaluationRunResultDto
+        grouped_by_datapoint_evaluator: defaultdict[
+            str, defaultdict[str, list[float]]
+        ] = defaultdict(lambda: defaultdict(list))
+
+        for eval_run_result in self.evaluation_set_results:
+            datapoint_id = eval_run_result.evaluation_name
+            for eval_run_result_dto in eval_run_result.evaluation_run_results:
+                evaluator_name = eval_run_result_dto.evaluator_name
+                score = eval_run_result_dto.result.score
+                grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
+                    score
+                )
+
+        # Step 2: Deduplicate by averaging same evaluator results for same datapoint
+        dedup_scores: list[tuple[str, str, float]] = []
+        for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
+            for evaluator_name, scores_list in evaluators_dict.items():
+                if scores_list:
+                    # Average the scores for this evaluator on this datapoint
+                    avg_score = sum(scores_list) / len(scores_list)
+                    dedup_scores.append((datapoint_id, evaluator_name, avg_score))
+
+        # Step 3: Group by evaluator and calculate average score per evaluator
+        grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
+        for _datapoint_id, evaluator_name, score in dedup_scores:
+            grouped_by_evaluator[evaluator_name].append(score)
+
+        agg_metrics_per_evaluator = {}
+        for evaluator_name, scores_list in grouped_by_evaluator.items():
+            avg_score = sum(scores_list) / len(scores_list)
+            agg_metrics_per_evaluator[evaluator_name] = avg_score
+
+        # Step 4: Calculate final weighted score
+        if not agg_metrics_per_evaluator:
+            return 0.0, {}
+
+        total_weighted_score = 0.0
+        total_weight = 0.0
+
+        for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
+            weight = evaluator_weights.get(evaluator_name, default_weight)
+            total_weighted_score += avg_score * weight
+            total_weight += weight
+
+        final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
+
+        return final_score, agg_metrics_per_evaluator
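
An illustrative, hand-computed walk-through of the dedupe-then-weight aggregation that calculate_final_score performs, using made-up scores and a hypothetical weight map (plain Python, independent of the DTO classes above):

from collections import defaultdict

# (datapoint_id, evaluator_name, score); the duplicate "exact-match" rows for "case-1"
# are averaged first (0.8 and 1.0 -> 0.9), mirroring Step 2 above.
raw = [
    ("case-1", "exact-match", 0.8),
    ("case-1", "exact-match", 1.0),
    ("case-1", "llm-judge", 0.6),
    ("case-2", "exact-match", 0.5),
    ("case-2", "llm-judge", 1.0),
]

per_datapoint: defaultdict[tuple[str, str], list[float]] = defaultdict(list)
for datapoint, evaluator, score in raw:
    per_datapoint[(datapoint, evaluator)].append(score)

per_evaluator: defaultdict[str, list[float]] = defaultdict(list)
for (_, evaluator), scores in per_datapoint.items():
    per_evaluator[evaluator].append(sum(scores) / len(scores))

# Step 3: exact-match -> (0.9 + 0.5) / 2 = 0.7, llm-judge -> (0.6 + 1.0) / 2 = 0.8
averages = {name: sum(s) / len(s) for name, s in per_evaluator.items()}

# Step 4: weighted average; default_weight (1.0) applies to evaluators missing from the map.
weights = {"llm-judge": 2.0}  # hypothetical evaluator_weights
total_weight = sum(weights.get(name, 1.0) for name in averages)
final_score = sum(avg * weights.get(name, 1.0) for name, avg in averages.items()) / total_weight
# final_score = (0.7 * 1.0 + 0.8 * 2.0) / 3.0 = 0.766...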