uipath 2.1.59__py3-none-any.whl → 2.1.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/_evals/_models/_output.py +22 -9
- uipath/_cli/_evals/_models/_sw_reporting.py +21 -0
- uipath/_cli/_evals/_progress_reporter.py +436 -0
- uipath/_cli/_evals/_runtime.py +103 -10
- uipath/_cli/_runtime/_contracts.py +11 -4
- uipath/_cli/_utils/_folders.py +30 -24
- uipath/_cli/cli_eval.py +28 -6
- uipath/_cli/cli_invoke.py +5 -2
- uipath/_cli/cli_publish.py +4 -3
- uipath/_events/__init__.py +0 -0
- uipath/_events/_event_bus.py +157 -0
- uipath/_events/_events.py +53 -0
- uipath/agent/models/agent.py +13 -0
- uipath/eval/models/models.py +1 -1
- uipath/tracing/_otel_exporters.py +95 -91
- uipath/tracing/_traced.py +16 -0
- uipath/tracing/_utils.py +9 -2
- {uipath-2.1.59.dist-info → uipath-2.1.61.dist-info}/METADATA +1 -1
- {uipath-2.1.59.dist-info → uipath-2.1.61.dist-info}/RECORD +22 -17
- {uipath-2.1.59.dist-info → uipath-2.1.61.dist-info}/WHEEL +0 -0
- {uipath-2.1.59.dist-info → uipath-2.1.61.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.59.dist-info → uipath-2.1.61.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/_evals/_models/_output.py
CHANGED
@@ -62,9 +62,19 @@ class EvaluationRunResultDto(BaseModel):
 class EvaluationRunResult(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

+    score: float = 0.0
     evaluation_name: str
     evaluation_run_results: List[EvaluationRunResultDto]

+    def compute_average_score(self) -> None:
+        """Compute average score for this single eval_item."""
+        if not self.evaluation_run_results:
+            self.score = 0.0
+            return
+
+        total_score = sum(dto.result.score for dto in self.evaluation_run_results)
+        self.score = total_score / len(self.evaluation_run_results)
+

 class UiPathEvalOutput(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
@@ -74,12 +84,15 @@ class UiPathEvalOutput(BaseModel):
     evaluation_set_results: List[EvaluationRunResult]

     def compute_average_score(self) -> None:
-        [previous implementation, old lines 77-85, not captured in this extract]
+        """Compute overall average by calling eval_item.compute_average_score()."""
+        if not self.evaluation_set_results:
+            self.score = 0.0
+            return
+
+        for eval_result in self.evaluation_set_results:
+            eval_result.compute_average_score()
+
+        eval_item_scores = [
+            eval_result.score for eval_result in self.evaluation_set_results
+        ]
+        self.score = sum(eval_item_scores) / len(eval_item_scores)
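For orientation, scoring is now computed in two stages: each EvaluationRunResult averages the scores of its own evaluator results, and UiPathEvalOutput then averages those per-item averages. The sketch below mirrors that behavior with plain dataclass stand-ins rather than the actual uipath models (whose DTO constructors are not shown in this diff); the scores are made up.

from dataclasses import dataclass
from typing import List


@dataclass
class ResultStub:
    """Stand-in for EvaluationResultDto; only .score matters here."""

    score: float


@dataclass
class RunResultStub:
    """Mirrors EvaluationRunResult: averages its own evaluator results."""

    evaluation_run_results: List[ResultStub]
    score: float = 0.0

    def compute_average_score(self) -> None:
        if not self.evaluation_run_results:
            self.score = 0.0
            return
        total = sum(dto.score for dto in self.evaluation_run_results)
        self.score = total / len(self.evaluation_run_results)


@dataclass
class EvalOutputStub:
    """Mirrors UiPathEvalOutput: averages the per-item averages."""

    evaluation_set_results: List[RunResultStub]
    score: float = 0.0

    def compute_average_score(self) -> None:
        if not self.evaluation_set_results:
            self.score = 0.0
            return
        for item in self.evaluation_set_results:
            item.compute_average_score()
        self.score = sum(i.score for i in self.evaluation_set_results) / len(
            self.evaluation_set_results
        )


output = EvalOutputStub(
    evaluation_set_results=[
        RunResultStub([ResultStub(80.0), ResultStub(100.0)]),  # item average: 90.0
        RunResultStub([ResultStub(60.0)]),                     # item average: 60.0
    ]
)
output.compute_average_score()
print(output.score)  # 75.0: the mean of the per-item averages, not of all raw scores (80.0)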
uipath/_cli/_evals/_models/_sw_reporting.py
NEW
@@ -0,0 +1,21 @@
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict
+from pydantic.alias_generators import to_camel
+
+from uipath.eval.models import EvalItemResult
+
+
+class StudioWebProgressItem(BaseModel):
+    eval_run_id: str
+    eval_results: list[EvalItemResult]
+    success: bool
+    agent_output: dict[str, Any]
+    agent_execution_time: float
+
+
+class StudioWebAgentSnapshot(BaseModel):
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    input_schema: dict[str, Any]
+    output_schema: dict[str, Any]
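Both models rely on pydantic's camelCase alias generation; StudioWebAgentSnapshot in particular is later serialized with model_dump(by_alias=True) when it is embedded in the evalSetRun payload. A small sketch of that round trip, duplicating the model locally instead of importing it from the package:

from typing import Any

from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class StudioWebAgentSnapshot(BaseModel):
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    input_schema: dict[str, Any]
    output_schema: dict[str, Any]


# populate_by_name=True lets callers use the snake_case field names directly...
snapshot = StudioWebAgentSnapshot(
    input_schema={"type": "object", "properties": {"query": {"type": "string"}}},
    output_schema={"type": "object"},
)

# ...while by_alias=True emits the camelCase keys expected by the StudioWeb API.
print(snapshot.model_dump(by_alias=True))
# {'inputSchema': {'type': 'object', 'properties': {...}}, 'outputSchema': {'type': 'object'}}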
uipath/_cli/_evals/_progress_reporter.py
NEW
@@ -0,0 +1,436 @@
+"""Progress reporter for sending evaluation updates to StudioWeb."""
+
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List
+
+from opentelemetry import trace
+
+from uipath import UiPath
+from uipath._cli._evals._models._evaluation_set import EvaluationItem, EvaluationStatus
+from uipath._cli._evals._models._sw_reporting import (
+    StudioWebAgentSnapshot,
+    StudioWebProgressItem,
+)
+from uipath._cli._utils._console import ConsoleLogger
+from uipath._cli._utils._project_files import (  # type: ignore
+    get_project_config,
+)
+from uipath._events._event_bus import EventBus
+from uipath._events._events import (
+    EvalRunCreatedEvent,
+    EvalRunUpdatedEvent,
+    EvalSetRunCreatedEvent,
+    EvalSetRunUpdatedEvent,
+    EvaluationEvents,
+)
+from uipath._utils import Endpoint, RequestSpec
+from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
+from uipath.eval.evaluators import BaseEvaluator
+from uipath.eval.models import EvalItemResult, ScoreType
+from uipath.tracing import LlmOpsHttpExporter
+
+logger = logging.getLogger(__name__)
+
+
+def gracefully_handle_errors(func):
+    """Decorator to catch and log errors without stopping execution."""
+
+    @functools.wraps(func)
+    async def wrapper(self, *args, **kwargs):
+        try:
+            return await func(self, *args, **kwargs)
+        except Exception as e:
+            if hasattr(self, "_console"):
+                error_type = type(e).__name__
+                logger.warning(
+                    f"Cannot report progress to SW. "
+                    f"Function: {func.__name__}, "
+                    f"Error type: {error_type}, "
+                    f"Details: {e}"
+                )
+            return None
+
+    return wrapper
+
+
+class StudioWebProgressReporter:
+    """Handles reporting evaluation progress to StudioWeb."""
+
+    def __init__(self):
+        logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
+        console_logger = ConsoleLogger.get_instance()
+        uipath = UiPath()
+
+        self._client = uipath.api_client
+        self._console = console_logger
+        self._project_id = os.getenv("UIPATH_PROJECT_ID", None)
+        if not self._project_id:
+            logger.warning(
+                "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID."
+            )
+
+        self.eval_set_run_ids: Dict[str, str] = {}
+        self.evaluators: Dict[str, Any] = {}
+        self.evaluator_scores: Dict[str, List[float]] = {}
+        self.eval_run_ids: Dict[str, str] = {}
+
+    @gracefully_handle_errors
+    async def create_eval_set_run(
+        self,
+        eval_set_id: str,
+        agent_snapshot: StudioWebAgentSnapshot,
+        no_of_evals: int,
+        evaluators: List[BaseEvaluator[Any]],
+    ) -> str:
+        """Create a new evaluation set run in StudioWeb."""
+        spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
+        response = await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+        )
+        eval_set_run_id = json.loads(response.content)["id"]
+        return eval_set_run_id
+
+    @gracefully_handle_errors
+    async def create_eval_run(
+        self, eval_item: EvaluationItem, eval_set_run_id: str
+    ) -> str:
+        """Create a new evaluation run in StudioWeb.
+
+        Args:
+            eval_item: Dictionary containing evaluation data
+            eval_set_run_id: The ID of the evaluation set run
+
+        Returns:
+            The ID of the created evaluation run
+        """
+        spec = self._create_eval_run_spec(eval_item, eval_set_run_id)
+        response = await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+        )
+        return json.loads(response.content)["id"]
+
+    @gracefully_handle_errors
+    async def update_eval_run(
+        self,
+        sw_progress_item: StudioWebProgressItem,
+        evaluators: dict[str, BaseEvaluator[Any]],
+    ):
+        """Update an evaluation run with results."""
+        assertion_runs, evaluator_scores = self._collect_results(
+            sw_progress_item.eval_results, evaluators
+        )
+        spec = self._update_eval_run_spec(
+            assertion_runs=assertion_runs,
+            evaluator_scores=evaluator_scores,
+            eval_run_id=sw_progress_item.eval_run_id,
+            execution_time=sw_progress_item.agent_execution_time,
+            actual_output=sw_progress_item.agent_output,
+        )
+        await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+        )
+
+    @gracefully_handle_errors
+    async def update_eval_set_run(
+        self,
+        eval_set_run_id: str,
+        evaluator_scores: dict[str, float],
+    ):
+        """Update the evaluation set run status to complete."""
+        spec = self._update_eval_set_run_spec(eval_set_run_id, evaluator_scores)
+        await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+        )
+
+    async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None:
+        try:
+            self.evaluators = {eval.id: eval for eval in payload.evaluators}
+            self.evaluator_scores = {eval.id: [] for eval in payload.evaluators}
+
+            eval_set_run_id = await self.create_eval_set_run(
+                eval_set_id=payload.eval_set_id,
+                agent_snapshot=self._extract_agent_snapshot(payload.entrypoint),
+                no_of_evals=payload.no_of_evals,
+                evaluators=payload.evaluators,
+            )
+            self.eval_set_run_ids[payload.execution_id] = eval_set_run_id
+            current_span = trace.get_current_span()
+            if current_span.is_recording():
+                current_span.set_attribute("eval_set_run_id", eval_set_run_id)
+
+            logger.debug(f"Created eval set run with ID: {eval_set_run_id}")
+
+        except Exception as e:
+            logger.error(f"Failed to handle create eval set run event: {e}")
+
+    async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None:
+        try:
+            if eval_set_run_id := self.eval_set_run_ids.get(payload.execution_id):
+                eval_run_id = await self.create_eval_run(
+                    payload.eval_item, eval_set_run_id
+                )
+                if eval_run_id:
+                    self.eval_run_ids[payload.execution_id] = eval_run_id
+                    logger.debug(f"Created eval run with ID: {eval_run_id}")
+            else:
+                logger.warning("Cannot create eval run: eval_set_run_id not available")
+
+        except Exception as e:
+            logger.error(f"Failed to handle create eval run event: {e}")
+
+    async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None:
+        try:
+            spans_exporter = LlmOpsHttpExporter(
+                trace_id=self.eval_set_run_ids.get(payload.execution_id),
+            )
+
+            spans_exporter.export(payload.spans)
+
+            for eval_result in payload.eval_results:
+                evaluator_id = eval_result.evaluator_id
+                if evaluator_id in self.evaluator_scores:
+                    match eval_result.result.score_type:
+                        case ScoreType.NUMERICAL:
+                            self.evaluator_scores[evaluator_id].append(
+                                eval_result.result.score
+                            )
+                        case ScoreType.BOOLEAN:
+                            self.evaluator_scores[evaluator_id].append(
+                                100 if eval_result.result.score else 0
+                            )
+                        case ScoreType.ERROR:
+                            self.evaluator_scores[evaluator_id].append(0)
+
+            eval_run_id = self.eval_run_ids[payload.execution_id]
+            if eval_run_id:
+                await self.update_eval_run(
+                    StudioWebProgressItem(
+                        eval_run_id=eval_run_id,
+                        eval_results=payload.eval_results,
+                        success=payload.success,
+                        agent_output=payload.agent_output,
+                        agent_execution_time=payload.agent_execution_time,
+                    ),
+                    self.evaluators,
+                )
+
+                logger.debug(f"Updated eval run with ID: {eval_run_id}")
+
+        except Exception as e:
+            logger.error(f"Failed to handle update eval run event: {e}")
+
+    async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None:
+        try:
+            if eval_set_run_id := self.eval_set_run_ids.get(payload.execution_id):
+                await self.update_eval_set_run(
+                    eval_set_run_id,
+                    payload.evaluator_scores,
+                )
+                logger.debug(f"Updated eval set run with ID: {eval_set_run_id}")
+            else:
+                logger.warning(
+                    "Cannot update eval set run: eval_set_run_id not available"
+                )
+
+        except Exception as e:
+            logger.error(f"Failed to handle update eval set run event: {e}")
+
+    async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None:
+        event_bus.subscribe(
+            EvaluationEvents.CREATE_EVAL_SET_RUN, self.handle_create_eval_set_run
+        )
+        event_bus.subscribe(
+            EvaluationEvents.CREATE_EVAL_RUN, self.handle_create_eval_run
+        )
+        event_bus.subscribe(
+            EvaluationEvents.UPDATE_EVAL_RUN, self.handle_update_eval_run
+        )
+        event_bus.subscribe(
+            EvaluationEvents.UPDATE_EVAL_SET_RUN, self.handle_update_eval_set_run
+        )
+
+        logger.info("StudioWeb progress reporter subscribed to evaluation events")
+
+    def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
+        try:
+            project_config = get_project_config(os.getcwd())
+            ep = None
+            for entry_point in project_config.get("entryPoints", []):
+                if entry_point.get("filePath") == entrypoint:
+                    ep = entry_point
+                    break
+
+            if not ep:
+                logger.warning(
+                    f"Entrypoint {entrypoint} not found in configuration file"
+                )
+                return StudioWebAgentSnapshot(input_schema={}, output_schema={})
+
+            input_schema = ep.get("input", {})
+            output_schema = ep.get("output", {})
+
+            return StudioWebAgentSnapshot(
+                input_schema=input_schema, output_schema=output_schema
+            )
+        except Exception as e:
+            logger.warning(f"Failed to extract agent snapshot: {e}")
+            return StudioWebAgentSnapshot(input_schema={}, output_schema={})
+
+    def _collect_results(
+        self,
+        eval_results: list[EvalItemResult],
+        evaluators: dict[str, BaseEvaluator[Any]],
+    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+        assertion_runs: list[dict[str, Any]] = []
+        evaluator_scores_list: list[dict[str, Any]] = []
+        for eval_result in eval_results:
+            evaluator_scores_list.append(
+                {
+                    "type": eval_result.result.score_type.value,
+                    "value": eval_result.result.score,
+                    "justification": eval_result.result.details,
+                    "evaluatorId": eval_result.evaluator_id,
+                }
+            )
+            assertion_runs.append(
+                {
+                    "status": EvaluationStatus.COMPLETED.value,
+                    "evaluatorId": eval_result.evaluator_id,
+                    "completionMetrics": {
+                        "duration": int(eval_result.result.evaluation_time)
+                        if eval_result.result.evaluation_time
+                        else 0,
+                        "cost": None,
+                        "tokens": 0,
+                        "completionTokens": 0,
+                        "promptTokens": 0,
+                    },
+                    "assertionSnapshot": {
+                        "assertionType": evaluators[
+                            eval_result.evaluator_id
+                        ].evaluator_type.name,
+                        "outputKey": evaluators[
+                            eval_result.evaluator_id
+                        ].target_output_key,
+                    },
+                }
+            )
+        return assertion_runs, evaluator_scores_list
+
+    def _update_eval_run_spec(
+        self,
+        assertion_runs: list[dict[str, Any]],
+        evaluator_scores: list[dict[str, Any]],
+        eval_run_id: str,
+        actual_output: dict[str, Any],
+        execution_time: float,
+    ) -> RequestSpec:
+        return RequestSpec(
+            method="PUT",
+            endpoint=Endpoint(
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
+            ),
+            json={
+                "evalRunId": eval_run_id,
+                "status": EvaluationStatus.COMPLETED.value,
+                "result": {
+                    "output": {"content": {**actual_output}},
+                    "evaluatorScores": evaluator_scores,
+                },
+                "completionMetrics": {"duration": int(execution_time)},
+                "assertionRuns": assertion_runs,
+            },
+            headers=self._tenant_header(),
+        )
+
+    def _create_eval_run_spec(
+        self, eval_item: EvaluationItem, eval_set_run_id: str
+    ) -> RequestSpec:
+        return RequestSpec(
+            method="POST",
+            endpoint=Endpoint(
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
+            ),
+            json={
+                "evalSetRunId": eval_set_run_id,
+                "evalSnapshot": {
+                    "id": eval_item.id,
+                    "name": eval_item.name,
+                    "inputs": eval_item.inputs,
+                    "expectedOutput": eval_item.expected_output,
+                },
+                "status": EvaluationStatus.IN_PROGRESS.value,
+            },
+            headers=self._tenant_header(),
+        )
+
+    def _create_eval_set_run_spec(
+        self,
+        eval_set_id: str,
+        agent_snapshot: StudioWebAgentSnapshot,
+        no_of_evals: int,
+    ) -> RequestSpec:
+        return RequestSpec(
+            method="POST",
+            endpoint=Endpoint(
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
+            ),
+            json={
+                "agentId": self._project_id,
+                "evalSetId": eval_set_id,
+                "agentSnapshot": agent_snapshot.model_dump(by_alias=True),
+                "status": EvaluationStatus.IN_PROGRESS.value,
+                "numberOfEvalsExecuted": no_of_evals,
+            },
+            headers=self._tenant_header(),
+        )
+
+    def _update_eval_set_run_spec(
+        self,
+        eval_set_run_id: str,
+        evaluator_scores: dict[str, float],
+    ) -> RequestSpec:
+        evaluator_scores_list = [
+            {"value": avg_score, "evaluatorId": evaluator_id}
+            for evaluator_id, avg_score in evaluator_scores.items()
+        ]
+
+        return RequestSpec(
+            method="PUT",
+            endpoint=Endpoint(
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
+            ),
+            json={
+                "evalSetRunId": eval_set_run_id,
+                "status": EvaluationStatus.COMPLETED.value,
+                "evaluatorScores": evaluator_scores_list,
+            },
+            headers=self._tenant_header(),
+        )
+
+    def _tenant_header(self) -> dict[str, str]:
+        tenant_id = os.getenv(ENV_TENANT_ID, None)
+        if not tenant_id:
+            self._console.error(
+                f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'."
+            )
+        return {HEADER_INTERNAL_TENANT_ID: tenant_id}  # type: ignore
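The gracefully_handle_errors decorator above is what keeps a failed StudioWeb call from aborting the evaluation itself: the wrapped coroutine's exception is logged as a warning and swallowed, and the caller simply receives None. Below is a self-contained sketch of the same pattern, slightly simplified (the hasattr(self, "_console") guard is dropped) and using a hypothetical FlakyReporter in place of the real class:

import asyncio
import functools
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def gracefully_handle_errors(func):
    """Catch and log errors from an async method without stopping execution."""

    @functools.wraps(func)
    async def wrapper(self, *args, **kwargs):
        try:
            return await func(self, *args, **kwargs)
        except Exception as e:
            logger.warning(
                f"Cannot report progress to SW. "
                f"Function: {func.__name__}, "
                f"Error type: {type(e).__name__}, "
                f"Details: {e}"
            )
            return None

    return wrapper


class FlakyReporter:
    """Hypothetical stand-in for StudioWebProgressReporter."""

    @gracefully_handle_errors
    async def create_eval_run(self) -> str:
        raise ConnectionError("StudioWeb unreachable")


async def main() -> None:
    # The error is logged, not raised; the evaluation loop would carry on with None.
    run_id = await FlakyReporter().create_eval_run()
    print(run_id)  # None


asyncio.run(main())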
uipath/_cli/_evals/_runtime.py
CHANGED
@@ -7,9 +7,17 @@ from typing import Any, Dict, Generic, List, Optional, Sequence, TypeVar
 from opentelemetry.sdk.trace import ReadableSpan
 from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult

+from ..._events._event_bus import EventBus
+from ..._events._events import (
+    EvalRunCreatedEvent,
+    EvalRunUpdatedEvent,
+    EvalSetRunCreatedEvent,
+    EvalSetRunUpdatedEvent,
+    EvaluationEvents,
+)
 from ...eval.evaluators import BaseEvaluator
 from ...eval.models import EvaluationResult
-from ...eval.models.models import AgentExecution
+from ...eval.models.models import AgentExecution, EvalItemResult
 from .._runtime._contracts import (
     UiPathBaseRuntime,
     UiPathRuntimeContext,
@@ -75,10 +83,16 @@ class UiPathEvalContext(UiPathRuntimeContext):
 class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
     """Specialized runtime for evaluation runs, with access to the factory."""

-    def __init__(self, context: UiPathEvalContext, factory: UiPathRuntimeFactory[T, C]):
+    def __init__(
+        self,
+        context: UiPathEvalContext,
+        factory: UiPathRuntimeFactory[T, C],
+        event_bus: EventBus,
+    ):
         super().__init__(context)
         self.context: UiPathEvalContext = context
         self.factory: UiPathRuntimeFactory[T, C] = factory
+        self.event_bus: EventBus = event_bus
         self.span_exporter: ExecutionSpanExporter = ExecutionSpanExporter()
         self.factory.add_span_exporter(self.span_exporter)
@@ -87,50 +101,119 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
         cls,
         context: UiPathEvalContext,
         factory: UiPathRuntimeFactory[T, C],
+        event_bus: EventBus,
     ) -> "UiPathEvalRuntime[T, C]":
-        return cls(context, factory)
+        return cls(context, factory, event_bus)

     async def execute(self) -> Optional[UiPathRuntimeResult]:
-        """Evaluation logic. Can spawn other runtimes through the factory."""
         if self.context.eval_set is None:
             raise ValueError("eval_set must be provided for evaluation runs")

+        if not self.context.execution_id:
+            raise ValueError("execution_id must be provided for evaluation runs")
+
+        event_bus = self.event_bus
+
         evaluation_set = EvalHelpers.load_eval_set(
             self.context.eval_set, self.context.eval_ids
         )
         evaluators = self._load_evaluators(evaluation_set)
+
+        evaluator_averages = {evaluator.id: 0.0 for evaluator in evaluators}
+        evaluator_counts = {evaluator.id: 0 for evaluator in evaluators}
+
+        await event_bus.publish(
+            EvaluationEvents.CREATE_EVAL_SET_RUN,
+            EvalSetRunCreatedEvent(
+                execution_id=self.context.execution_id,
+                entrypoint=self.context.entrypoint or "",
+                eval_set_id=evaluation_set.id,
+                no_of_evals=len(evaluation_set.evaluations),
+                evaluators=evaluators,
+            ),
+        )
+
         results = UiPathEvalOutput(
             evaluation_set_name=evaluation_set.name, score=0, evaluation_set_results=[]
         )
         for eval_item in evaluation_set.evaluations:
+            await event_bus.publish(
+                EvaluationEvents.CREATE_EVAL_RUN,
+                EvalRunCreatedEvent(
+                    execution_id=self.context.execution_id,
+                    eval_item=eval_item,
+                ),
+            )
+
             evaluation_run_results = EvaluationRunResult(
                 evaluation_name=eval_item.name, evaluation_run_results=[]
             )

             results.evaluation_set_results.append(evaluation_run_results)
+
             agent_execution_output = await self.execute_runtime(eval_item)
-
+            evaluation_item_results: list[EvalItemResult] = []
+
             for evaluator in evaluators:
                 evaluation_result = await self.run_evaluator(
                     evaluator=evaluator,
                     execution_output=agent_execution_output,
                     eval_item=eval_item,
                 )
+
+                dto_result = EvaluationResultDto.from_evaluation_result(
+                    evaluation_result
+                )
+                evaluator_counts[evaluator.id] += 1
+                count = evaluator_counts[evaluator.id]
+                evaluator_averages[evaluator.id] += (
+                    dto_result.score - evaluator_averages[evaluator.id]
+                ) / count
+
                 evaluation_run_results.evaluation_run_results.append(
                     EvaluationRunResultDto(
                         evaluator_name=evaluator.name,
-                        result=
-                        [old lines 123-124 not captured in this extract]
+                        result=dto_result,
+                    )
+                )
+                evaluation_item_results.append(
+                    EvalItemResult(
+                        evaluator_id=evaluator.id,
+                        result=evaluation_result,
                     )
                 )

+            evaluation_run_results.compute_average_score()
+
+            await event_bus.publish(
+                EvaluationEvents.UPDATE_EVAL_RUN,
+                EvalRunUpdatedEvent(
+                    execution_id=self.context.execution_id,
+                    eval_item=eval_item,
+                    eval_results=evaluation_item_results,
+                    success=not agent_execution_output.result.error,
+                    agent_output=agent_execution_output.result.output,
+                    agent_execution_time=agent_execution_output.execution_time,
+                    spans=agent_execution_output.spans,
+                ),
+                wait_for_completion=False,
+            )
+
         results.compute_average_score()
+
+        await event_bus.publish(
+            EvaluationEvents.UPDATE_EVAL_SET_RUN,
+            EvalSetRunUpdatedEvent(
+                execution_id=self.context.execution_id,
+                evaluator_scores=evaluator_averages,
+            ),
+            wait_for_completion=False,
+        )
+
         self.context.result = UiPathRuntimeResult(
             output={**results.model_dump(by_alias=True)},
             status=UiPathRuntimeStatus.SUCCESSFUL,
         )
-
         return self.context.result

     async def execute_runtime(
@@ -141,11 +224,21 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
             input_json=eval_item.inputs,
             is_eval_run=True,
         )
+        attributes = {
+            "evalId": eval_item.id,
+            "span_type": "eval",
+        }
+        if runtime_context.execution_id:
+            attributes["execution.id"] = runtime_context.execution_id
+
         start_time = time()
+
         result = await self.factory.execute_in_root_span(
-            runtime_context, root_span=eval_item.name
+            runtime_context, root_span=eval_item.name, attributes=attributes
        )
+
         end_time = time()
+
         if runtime_context.execution_id is None:
             raise ValueError("execution_id must be set for eval runs")

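One detail worth noting in the new execute() bookkeeping: evaluator_averages is maintained as a running mean, updated per evaluator with avg += (score - avg) / count instead of storing every score and dividing at the end. A quick standalone check that the incremental form equals the plain mean (the scores are illustrative):

scores = [80.0, 100.0, 55.0, 90.0]

average = 0.0
count = 0
for score in scores:
    count += 1
    # Same per-evaluator update the runtime applies after each eval item.
    average += (score - average) / count

assert abs(average - sum(scores) / len(scores)) < 1e-9
print(average)  # 81.25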