uipath-2.1.52-py3-none-any.whl → uipath-2.1.53-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/_evals/{_evaluators/_evaluator_factory.py → _evaluator_factory.py} +24 -23
- uipath/_cli/_evals/_models/_evaluation_set.py +23 -18
- uipath/_cli/_evals/_models/_evaluator_base_params.py +16 -0
- uipath/_cli/_evals/_models/_output.py +85 -0
- uipath/_cli/_evals/_runtime.py +102 -10
- uipath/_cli/_runtime/_contracts.py +12 -3
- uipath/_cli/_utils/_eval_set.py +1 -1
- uipath/_cli/cli_eval.py +46 -61
- uipath/eval/evaluators/__init__.py +15 -0
- uipath/eval/evaluators/base_evaluator.py +88 -0
- uipath/eval/evaluators/deterministic_evaluator_base.py +53 -0
- uipath/eval/evaluators/exact_match_evaluator.py +37 -0
- uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py} +23 -40
- uipath/eval/evaluators/llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/trajectory_evaluator.py +36 -0
- uipath/eval/models/__init__.py +19 -0
- uipath/{_cli/_evals/_models/_evaluators.py → eval/models/models.py} +67 -43
- {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/METADATA +1 -1
- {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/RECORD +22 -22
- uipath/_cli/_evals/_evaluators/__init__.py +0 -22
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +0 -46
- uipath/_cli/_evals/_evaluators/_evaluator_base.py +0 -124
- uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +0 -40
- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +0 -183
- uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py +0 -48
- uipath/_cli/_evals/_models/__init__.py +0 -18
- uipath/_cli/_evals/_models/_agent_execution_output.py +0 -14
- uipath/_cli/_evals/progress_reporter.py +0 -304
- {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/WHEEL +0 -0
- {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/licenses/LICENSE +0 -0
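Taken together, this release moves the evaluation framework out of the private `uipath._cli._evals._evaluators` package into a new public `uipath.eval.evaluators` package (with shared models under `uipath.eval.models`) and deletes the StudioWeb progress reporter shown in the hunk below. The following is a hedged sketch of how downstream code might cope with the module move; only the two package paths are taken from the file list above, and the try/except fallback itself is an assumption, not an API documented by this release.

# Hedged sketch: the two module paths come from the file list in this diff;
# the fallback strategy itself is an assumption, not part of the package.
import importlib

try:
    # 2.1.53+: evaluators live in the new public package uipath.eval.evaluators
    evaluators = importlib.import_module("uipath.eval.evaluators")
except ModuleNotFoundError:
    # <= 2.1.52: evaluators lived in the private CLI package removed by this diff
    evaluators = importlib.import_module("uipath._cli._evals._evaluators")

Pinning to one side of the boundary (uipath>=2.1.53 or uipath<2.1.53) avoids the fallback entirely.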
@@ -1,304 +0,0 @@
-"""Progress reporter for sending evaluation updates to StudioWeb."""
-
-import json
-import logging
-import os
-from typing import Any, List
-
-from uipath import UiPath
-from uipath._cli._evals._evaluators import EvaluatorBase
-from uipath._cli._evals._models._evaluation_set import EvaluationStatus
-from uipath._cli._evals._models._evaluators import EvalItemResult, ScoreType
-from uipath._cli._utils._console import ConsoleLogger
-from uipath._utils import Endpoint, RequestSpec
-from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
-
-
-class ProgressReporter:
-    """Handles reporting evaluation progress to StudioWeb via API calls."""
-
-    def __init__(
-        self,
-        eval_set_id: str,
-        agent_snapshot: str,
-        no_of_evals: int,
-        evaluators: List[EvaluatorBase],
-    ):
-        """Initialize the progress reporter.
-
-        Args:
-            eval_set_id: ID of the evaluation set
-            agent_snapshot: JSON snapshot of the agent configuration
-            no_of_evals: Number of evaluations in the set
-            evaluators: List of evaluator instances
-        """
-        self._eval_set_id = eval_set_id
-        self.agent_snapshot = agent_snapshot
-        self._no_of_evals = no_of_evals
-        self._evaluators: dict[str, EvaluatorBase] = {
-            evaluator.id: evaluator for evaluator in evaluators
-        }
-        self._evaluator_scores: dict[str, list[float]] = {
-            evaluator.id: [] for evaluator in evaluators
-        }
-
-        # Disable middleware logging and use the same console as ConsoleLogger
-        logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
-
-        console_logger = ConsoleLogger.get_instance()
-
-        uipath = UiPath()
-
-        self._eval_set_run_id = None
-        self._client = uipath.api_client
-        self._console = console_logger
-        self._project_id = os.getenv("UIPATH_PROJECT_ID", None)
-        if not self._project_id:
-            self._console.warning(
-                "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID."
-            )
-
-    async def create_eval_set_run(self):
-        """Create a new evaluation set run in StudioWeb."""
-        spec = self._create_eval_set_run_spec()
-        response = await self._client.request_async(
-            method=spec.method,
-            url=spec.endpoint,
-            params=spec.params,
-            json=spec.json,
-            headers=spec.headers,
-        )
-        self._eval_set_run_id = json.loads(response.content)["id"]
-
-    async def create_eval_run(self, eval_item: dict[str, Any]):
-        """Create a new evaluation run in StudioWeb.
-
-        Args:
-            eval_item: Dictionary containing evaluation data
-
-        Returns:
-            The ID of the created evaluation run
-        """
-        spec = self._create_eval_run_spec(eval_item)
-        response = await self._client.request_async(
-            method=spec.method,
-            url=spec.endpoint,
-            params=spec.params,
-            json=spec.json,
-            headers=spec.headers,
-        )
-        return json.loads(response.content)["id"]
-
-    async def update_eval_run(
-        self,
-        eval_results: list[EvalItemResult],
-        eval_run_id: str,
-        execution_time: float,
-    ):
-        """Update an evaluation run with results.
-
-        Args:
-            eval_results: Dictionary mapping evaluator IDs to evaluation results
-            eval_run_id: ID of the evaluation run to update
-            execution_time: The agent execution time
-        """
-        assertion_runs, evaluator_scores, actual_output = self._collect_results(
-            eval_results
-        )
-        spec = self._update_eval_run_spec(
-            assertion_runs=assertion_runs,
-            evaluator_scores=evaluator_scores,
-            eval_run_id=eval_run_id,
-            execution_time=execution_time,
-            actual_output=actual_output,
-        )
-        await self._client.request_async(
-            method=spec.method,
-            url=spec.endpoint,
-            params=spec.params,
-            json=spec.json,
-            headers=spec.headers,
-        )
-
-    async def update_eval_set_run(self):
-        """Update the evaluation set run status to complete."""
-        spec = self._update_eval_set_run_spec()
-        await self._client.request_async(
-            method=spec.method,
-            url=spec.endpoint,
-            params=spec.params,
-            json=spec.json,
-            headers=spec.headers,
-        )
-
-    def _collect_results(
-        self, eval_results: list[EvalItemResult]
-    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
-        assertion_runs: list[dict[str, Any]] = []
-        evaluator_scores: list[dict[str, Any]] = []
-        actual_output: dict[str, Any] = {}
-        for eval_result in eval_results:
-            # keep track of evaluator scores. this should be removed after this computation is done server-side
-
-            # check the evaluator score type
-            match eval_result.result.score_type:
-                case ScoreType.NUMERICAL:
-                    self._evaluator_scores[eval_result.evaluator_id].append(
-                        eval_result.result.score
-                    )
-                case ScoreType.BOOLEAN:
-                    self._evaluator_scores[eval_result.evaluator_id].append(
-                        100 if eval_result.result.score else 0
-                    )
-                case ScoreType.ERROR:
-                    self._evaluator_scores[eval_result.evaluator_id].append(0)
-
-            evaluator_scores.append(
-                {
-                    "type": eval_result.result.score_type.value,
-                    "value": eval_result.result.score,
-                    "justification": eval_result.result.details,
-                    "evaluatorId": eval_result.evaluator_id,
-                }
-            )
-            assertion_runs.append(
-                {
-                    "status": EvaluationStatus.COMPLETED.value,
-                    "evaluatorId": eval_result.evaluator_id,
-                    "completionMetrics": {
-                        "duration": eval_result.result.evaluation_time,
-                        "cost": None,
-                        "tokens": 0,
-                        "completionTokens": 0,
-                        "promptTokens": 0,
-                    },
-                    "assertionSnapshot": {
-                        "assertionType": self._evaluators[
-                            eval_result.evaluator_id
-                        ].type.name,
-                        "outputKey": self._evaluators[
-                            eval_result.evaluator_id
-                        ].target_output_key,
-                    },
-                }
-            )
-
-            # we extract the actual output here. we should have the same 'actual_output' for each 'EvalItemResult'
-            actual_output = eval_result.result.actual_output
-
-        return assertion_runs, evaluator_scores, actual_output
-
-    def _update_eval_run_spec(
-        self,
-        assertion_runs: list[dict[str, Any]],
-        evaluator_scores: list[dict[str, Any]],
-        eval_run_id: str,
-        actual_output: dict[str, Any],
-        execution_time: float,
-    ) -> RequestSpec:
-        return RequestSpec(
-            method="PUT",
-            endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
-            ),
-            json={
-                "evalRunId": eval_run_id,
-                "status": EvaluationStatus.COMPLETED.value,
-                "result": {
-                    "output": {"content": {**actual_output}},
-                    "evaluatorScores": evaluator_scores,
-                },
-                "completionMetrics": {"duration": int(execution_time)},
-                "assertionRuns": assertion_runs,
-            },
-            headers=self._tenant_header(),
-        )
-
-    def _create_eval_run_spec(self, eval_item: dict[str, Any]) -> RequestSpec:
-        return RequestSpec(
-            method="POST",
-            endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
-            ),
-            json={
-                "evalSetRunId": self._eval_set_run_id,
-                "evalSnapshot": {
-                    "id": eval_item["id"],
-                    "name": eval_item["name"],
-                    "inputs": eval_item.get("inputs"),
-                    "expectedOutput": eval_item.get("expectedOutput", {}),
-                },
-                "status": EvaluationStatus.IN_PROGRESS.value,
-            },
-            headers=self._tenant_header(),
-        )
-
-    def _create_eval_set_run_spec(
-        self,
-    ) -> RequestSpec:
-        agent_snapshot_dict = json.loads(self.agent_snapshot)
-
-        return RequestSpec(
-            method="POST",
-            endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
-            ),
-            json={
-                "agentId": self._project_id,
-                "evalSetId": self._eval_set_id,
-                "agentSnapshot": agent_snapshot_dict,
-                "status": EvaluationStatus.IN_PROGRESS.value,
-                "numberOfEvalsExecuted": self._no_of_evals,
-            },
-            headers=self._tenant_header(),
-        )
-
-    def _compute_evaluator_scores(self):
-        evaluator_scores = []
-        evaluator_averages = []
-
-        for evaluator in self._evaluators.values():
-            scores = self._evaluator_scores[evaluator.id]
-            if scores:
-                avg_score = sum(scores) / len(scores)
-                evaluator_scores.append(
-                    {"value": avg_score, "evaluatorId": evaluator.id}
-                )
-                evaluator_averages.append(avg_score)
-            else:
-                # fallback to score 0
-                evaluator_scores.append({"value": 0, "evaluatorId": evaluator.id})
-                evaluator_averages.append(0)
-
-        overall_score = (
-            sum(evaluator_averages) / len(evaluator_averages)
-            if evaluator_averages
-            else 0
-        )
-        return evaluator_scores, overall_score
-
-    def _update_eval_set_run_spec(
-        self,
-    ) -> RequestSpec:
-        # this should be removed after computations are done server-side
-        evaluator_scores, overall_score = self._compute_evaluator_scores()
-        return RequestSpec(
-            method="PUT",
-            endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
-            ),
-            json={
-                "evalSetRunId": self._eval_set_run_id,
-                "status": EvaluationStatus.COMPLETED.value,
-                "evaluatorScores": evaluator_scores,
-            },
-            headers=self._tenant_header(),
-        )
-
-    def _tenant_header(self) -> dict[str, str]:
-        tenant_id = os.getenv(ENV_TENANT_ID, None)
-        if not tenant_id:
-            self._console.error(
-                f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'."
-            )
-        return {HEADER_INTERNAL_TENANT_ID: tenant_id}  # type: ignore
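The deleted module's `_collect_results` and `_compute_evaluator_scores` carried the client-side score roll-up that its own comments mark as temporary ("should be removed after this computation is done server-side"): boolean scores map to 100 or 0, errors map to 0, numerical scores pass through, then scores are averaged per evaluator and once more overall. Below is a minimal standalone sketch of that roll-up, using hypothetical plain dicts in place of `EvalItemResult`, for readers who want the aggregation logic without the StudioWeb plumbing.

# Minimal sketch of the score roll-up the deleted ProgressReporter performed
# client-side; inputs here are hypothetical plain dicts, not EvalItemResult.
from statistics import mean

def normalize(score_type: str, score) -> float:
    # Mirrors the match statement in _collect_results: booleans become 100/0,
    # errors become 0, numerical scores pass through unchanged.
    if score_type == "boolean":
        return 100.0 if score else 0.0
    if score_type == "error":
        return 0.0
    return float(score)

def roll_up(results: list[dict]) -> tuple[dict[str, float], float]:
    # Group normalized scores per evaluator, then average per evaluator and
    # overall, as _compute_evaluator_scores did.
    per_evaluator: dict[str, list[float]] = {}
    for r in results:
        per_evaluator.setdefault(r["evaluator_id"], []).append(
            normalize(r["score_type"], r["score"])
        )
    averages = {ev: mean(scores) for ev, scores in per_evaluator.items()}
    overall = mean(averages.values()) if averages else 0.0
    return averages, overall

# Example: one boolean pass and one numerical score of 75 give an overall of 87.5.
print(roll_up([
    {"evaluator_id": "exact-match", "score_type": "boolean", "score": True},
    {"evaluator_id": "llm-judge", "score_type": "numerical", "score": 75},
]))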
File without changes
|
File without changes
|
File without changes
|