uipath 2.1.60__py3-none-any.whl → 2.1.61__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -62,9 +62,19 @@ class EvaluationRunResultDto(BaseModel):
 class EvaluationRunResult(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
+    score: float = 0.0
     evaluation_name: str
     evaluation_run_results: List[EvaluationRunResultDto]
 
+    def compute_average_score(self) -> None:
+        """Compute average score for this single eval_item."""
+        if not self.evaluation_run_results:
+            self.score = 0.0
+            return
+
+        total_score = sum(dto.result.score for dto in self.evaluation_run_results)
+        self.score = total_score / len(self.evaluation_run_results)
+
 
 class UiPathEvalOutput(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
@@ -74,12 +84,15 @@ class UiPathEvalOutput(BaseModel):
     evaluation_set_results: List[EvaluationRunResult]
 
     def compute_average_score(self) -> None:
-        total_score = 0.0
-        total_count = 0
-
-        for evaluation_set_result in self.evaluation_set_results:
-            for evaluation_run_result in evaluation_set_result.evaluation_run_results:
-                total_score += evaluation_run_result.result.score
-                total_count += 1
-
-        self.score = total_score / total_count if total_count > 0 else 0.0
+        """Compute overall average by calling eval_item.compute_average_score()."""
+        if not self.evaluation_set_results:
+            self.score = 0.0
+            return
+
+        for eval_result in self.evaluation_set_results:
+            eval_result.compute_average_score()
+
+        eval_item_scores = [
+            eval_result.score for eval_result in self.evaluation_set_results
+        ]
+        self.score = sum(eval_item_scores) / len(eval_item_scores)
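
For context, the scoring is now a two-level average: each EvaluationRunResult first averages its own evaluator scores, and UiPathEvalOutput then averages those per-item scores. A minimal standalone sketch (toy numbers only, no UiPath imports) of how this differs from the previous flat average when items have unequal evaluator counts:

# Illustrative only: mirrors the averaging logic above with plain lists.
item_scores = {
    "item_a": [80.0, 100.0],  # two evaluators ran on item_a
    "item_b": [0.0],          # only one evaluator ran on item_b
}

# New behavior: average per item, then average the per-item averages.
per_item = [sum(s) / len(s) for s in item_scores.values()]   # [90.0, 0.0]
overall_new = sum(per_item) / len(per_item)                  # 45.0

# Old behavior: one flat average over every individual score.
flat = [s for scores in item_scores.values() for s in scores]
overall_old = sum(flat) / len(flat)                          # 60.0

print(overall_new, overall_old)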
@@ -0,0 +1,21 @@
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict
+from pydantic.alias_generators import to_camel
+
+from uipath.eval.models import EvalItemResult
+
+
+class StudioWebProgressItem(BaseModel):
+    eval_run_id: str
+    eval_results: list[EvalItemResult]
+    success: bool
+    agent_output: dict[str, Any]
+    agent_execution_time: float
+
+
+class StudioWebAgentSnapshot(BaseModel):
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    input_schema: dict[str, Any]
+    output_schema: dict[str, Any]
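
These models use pydantic's to_camel alias generator, so payloads dumped with model_dump(by_alias=True) come out camelCased, which is what the request specs below rely on. A small sketch of that behavior, assuming pydantic v2 and using a stand-in model name:

from typing import Any

from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class AgentSnapshotSketch(BaseModel):  # stand-in for StudioWebAgentSnapshot
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    input_schema: dict[str, Any]
    output_schema: dict[str, Any]


# populate_by_name=True allows construction with snake_case field names;
# by_alias=True emits the camelCase keys used in the JSON payloads.
snapshot = AgentSnapshotSketch(input_schema={"type": "object"}, output_schema={})
print(snapshot.model_dump(by_alias=True))
# {'inputSchema': {'type': 'object'}, 'outputSchema': {}}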
@@ -0,0 +1,436 @@
+"""Progress reporter for sending evaluation updates to StudioWeb."""
+
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List
+
+from opentelemetry import trace
+
+from uipath import UiPath
+from uipath._cli._evals._models._evaluation_set import EvaluationItem, EvaluationStatus
+from uipath._cli._evals._models._sw_reporting import (
+    StudioWebAgentSnapshot,
+    StudioWebProgressItem,
+)
+from uipath._cli._utils._console import ConsoleLogger
+from uipath._cli._utils._project_files import (  # type: ignore
+    get_project_config,
+)
+from uipath._events._event_bus import EventBus
+from uipath._events._events import (
+    EvalRunCreatedEvent,
+    EvalRunUpdatedEvent,
+    EvalSetRunCreatedEvent,
+    EvalSetRunUpdatedEvent,
+    EvaluationEvents,
+)
+from uipath._utils import Endpoint, RequestSpec
+from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
+from uipath.eval.evaluators import BaseEvaluator
+from uipath.eval.models import EvalItemResult, ScoreType
+from uipath.tracing import LlmOpsHttpExporter
+
+logger = logging.getLogger(__name__)
+
+
+def gracefully_handle_errors(func):
+    """Decorator to catch and log errors without stopping execution."""
+
+    @functools.wraps(func)
+    async def wrapper(self, *args, **kwargs):
+        try:
+            return await func(self, *args, **kwargs)
+        except Exception as e:
+            if hasattr(self, "_console"):
+                error_type = type(e).__name__
+                logger.warning(
+                    f"Cannot report progress to SW. "
+                    f"Function: {func.__name__}, "
+                    f"Error type: {error_type}, "
+                    f"Details: {e}"
+                )
+            return None
+
+    return wrapper
+
+
+class StudioWebProgressReporter:
+    """Handles reporting evaluation progress to StudioWeb."""
+
+    def __init__(self):
+        logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
+        console_logger = ConsoleLogger.get_instance()
+        uipath = UiPath()
+
+        self._client = uipath.api_client
+        self._console = console_logger
+        self._project_id = os.getenv("UIPATH_PROJECT_ID", None)
+        if not self._project_id:
+            logger.warning(
+                "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID."
+            )
+
+        self.eval_set_run_ids: Dict[str, str] = {}
+        self.evaluators: Dict[str, Any] = {}
+        self.evaluator_scores: Dict[str, List[float]] = {}
+        self.eval_run_ids: Dict[str, str] = {}
+
+    @gracefully_handle_errors
+    async def create_eval_set_run(
+        self,
+        eval_set_id: str,
+        agent_snapshot: StudioWebAgentSnapshot,
+        no_of_evals: int,
+        evaluators: List[BaseEvaluator[Any]],
+    ) -> str:
+        """Create a new evaluation set run in StudioWeb."""
+        spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
+        response = await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+        )
+        eval_set_run_id = json.loads(response.content)["id"]
+        return eval_set_run_id
+
+    @gracefully_handle_errors
+    async def create_eval_run(
+        self, eval_item: EvaluationItem, eval_set_run_id: str
+    ) -> str:
+        """Create a new evaluation run in StudioWeb.
+
+        Args:
+            eval_item: Dictionary containing evaluation data
+            eval_set_run_id: The ID of the evaluation set run
+
+        Returns:
+            The ID of the created evaluation run
+        """
+        spec = self._create_eval_run_spec(eval_item, eval_set_run_id)
+        response = await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+        )
+        return json.loads(response.content)["id"]
+
+    @gracefully_handle_errors
+    async def update_eval_run(
+        self,
+        sw_progress_item: StudioWebProgressItem,
+        evaluators: dict[str, BaseEvaluator[Any]],
+    ):
+        """Update an evaluation run with results."""
+        assertion_runs, evaluator_scores = self._collect_results(
+            sw_progress_item.eval_results, evaluators
+        )
+        spec = self._update_eval_run_spec(
+            assertion_runs=assertion_runs,
+            evaluator_scores=evaluator_scores,
+            eval_run_id=sw_progress_item.eval_run_id,
+            execution_time=sw_progress_item.agent_execution_time,
+            actual_output=sw_progress_item.agent_output,
+        )
+        await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+        )
+
+    @gracefully_handle_errors
+    async def update_eval_set_run(
+        self,
+        eval_set_run_id: str,
+        evaluator_scores: dict[str, float],
+    ):
+        """Update the evaluation set run status to complete."""
+        spec = self._update_eval_set_run_spec(eval_set_run_id, evaluator_scores)
+        await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            json=spec.json,
+            headers=spec.headers,
+        )
+
+    async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None:
+        try:
+            self.evaluators = {eval.id: eval for eval in payload.evaluators}
+            self.evaluator_scores = {eval.id: [] for eval in payload.evaluators}
+
+            eval_set_run_id = await self.create_eval_set_run(
+                eval_set_id=payload.eval_set_id,
+                agent_snapshot=self._extract_agent_snapshot(payload.entrypoint),
+                no_of_evals=payload.no_of_evals,
+                evaluators=payload.evaluators,
+            )
+            self.eval_set_run_ids[payload.execution_id] = eval_set_run_id
+            current_span = trace.get_current_span()
+            if current_span.is_recording():
+                current_span.set_attribute("eval_set_run_id", eval_set_run_id)
+
+            logger.debug(f"Created eval set run with ID: {eval_set_run_id}")
+
+        except Exception as e:
+            logger.error(f"Failed to handle create eval set run event: {e}")
+
+    async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None:
+        try:
+            if eval_set_run_id := self.eval_set_run_ids.get(payload.execution_id):
+                eval_run_id = await self.create_eval_run(
+                    payload.eval_item, eval_set_run_id
+                )
+                if eval_run_id:
+                    self.eval_run_ids[payload.execution_id] = eval_run_id
+                    logger.debug(f"Created eval run with ID: {eval_run_id}")
+            else:
+                logger.warning("Cannot create eval run: eval_set_run_id not available")
+
+        except Exception as e:
+            logger.error(f"Failed to handle create eval run event: {e}")
+
+    async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None:
+        try:
+            spans_exporter = LlmOpsHttpExporter(
+                trace_id=self.eval_set_run_ids.get(payload.execution_id),
+            )
+
+            spans_exporter.export(payload.spans)
+
+            for eval_result in payload.eval_results:
+                evaluator_id = eval_result.evaluator_id
+                if evaluator_id in self.evaluator_scores:
+                    match eval_result.result.score_type:
+                        case ScoreType.NUMERICAL:
+                            self.evaluator_scores[evaluator_id].append(
+                                eval_result.result.score
+                            )
+                        case ScoreType.BOOLEAN:
+                            self.evaluator_scores[evaluator_id].append(
+                                100 if eval_result.result.score else 0
+                            )
+                        case ScoreType.ERROR:
+                            self.evaluator_scores[evaluator_id].append(0)
+
+            eval_run_id = self.eval_run_ids[payload.execution_id]
+            if eval_run_id:
+                await self.update_eval_run(
+                    StudioWebProgressItem(
+                        eval_run_id=eval_run_id,
+                        eval_results=payload.eval_results,
+                        success=payload.success,
+                        agent_output=payload.agent_output,
+                        agent_execution_time=payload.agent_execution_time,
+                    ),
+                    self.evaluators,
+                )
+
+                logger.debug(f"Updated eval run with ID: {eval_run_id}")
+
+        except Exception as e:
+            logger.error(f"Failed to handle update eval run event: {e}")
+
+    async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None:
+        try:
+            if eval_set_run_id := self.eval_set_run_ids.get(payload.execution_id):
+                await self.update_eval_set_run(
+                    eval_set_run_id,
+                    payload.evaluator_scores,
+                )
+                logger.debug(f"Updated eval set run with ID: {eval_set_run_id}")
+            else:
+                logger.warning(
+                    "Cannot update eval set run: eval_set_run_id not available"
+                )
+
+        except Exception as e:
+            logger.error(f"Failed to handle update eval set run event: {e}")
+
+    async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None:
+        event_bus.subscribe(
+            EvaluationEvents.CREATE_EVAL_SET_RUN, self.handle_create_eval_set_run
+        )
+        event_bus.subscribe(
+            EvaluationEvents.CREATE_EVAL_RUN, self.handle_create_eval_run
+        )
+        event_bus.subscribe(
+            EvaluationEvents.UPDATE_EVAL_RUN, self.handle_update_eval_run
+        )
+        event_bus.subscribe(
+            EvaluationEvents.UPDATE_EVAL_SET_RUN, self.handle_update_eval_set_run
+        )
+
+        logger.info("StudioWeb progress reporter subscribed to evaluation events")
+
+    def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
+        try:
+            project_config = get_project_config(os.getcwd())
+            ep = None
+            for entry_point in project_config.get("entryPoints", []):
+                if entry_point.get("filePath") == entrypoint:
+                    ep = entry_point
+                    break
+
+            if not ep:
+                logger.warning(
+                    f"Entrypoint {entrypoint} not found in configuration file"
+                )
+                return StudioWebAgentSnapshot(input_schema={}, output_schema={})
+
+            input_schema = ep.get("input", {})
+            output_schema = ep.get("output", {})
+
+            return StudioWebAgentSnapshot(
+                input_schema=input_schema, output_schema=output_schema
+            )
+        except Exception as e:
+            logger.warning(f"Failed to extract agent snapshot: {e}")
+            return StudioWebAgentSnapshot(input_schema={}, output_schema={})
+
+    def _collect_results(
+        self,
+        eval_results: list[EvalItemResult],
+        evaluators: dict[str, BaseEvaluator[Any]],
+    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+        assertion_runs: list[dict[str, Any]] = []
+        evaluator_scores_list: list[dict[str, Any]] = []
+        for eval_result in eval_results:
+            evaluator_scores_list.append(
+                {
+                    "type": eval_result.result.score_type.value,
+                    "value": eval_result.result.score,
+                    "justification": eval_result.result.details,
+                    "evaluatorId": eval_result.evaluator_id,
+                }
+            )
+            assertion_runs.append(
+                {
+                    "status": EvaluationStatus.COMPLETED.value,
+                    "evaluatorId": eval_result.evaluator_id,
+                    "completionMetrics": {
+                        "duration": int(eval_result.result.evaluation_time)
+                        if eval_result.result.evaluation_time
+                        else 0,
+                        "cost": None,
+                        "tokens": 0,
+                        "completionTokens": 0,
+                        "promptTokens": 0,
+                    },
+                    "assertionSnapshot": {
+                        "assertionType": evaluators[
+                            eval_result.evaluator_id
+                        ].evaluator_type.name,
+                        "outputKey": evaluators[
+                            eval_result.evaluator_id
+                        ].target_output_key,
+                    },
+                }
+            )
+        return assertion_runs, evaluator_scores_list
+
+    def _update_eval_run_spec(
+        self,
+        assertion_runs: list[dict[str, Any]],
+        evaluator_scores: list[dict[str, Any]],
+        eval_run_id: str,
+        actual_output: dict[str, Any],
+        execution_time: float,
+    ) -> RequestSpec:
+        return RequestSpec(
+            method="PUT",
+            endpoint=Endpoint(
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
+            ),
+            json={
+                "evalRunId": eval_run_id,
+                "status": EvaluationStatus.COMPLETED.value,
+                "result": {
+                    "output": {"content": {**actual_output}},
+                    "evaluatorScores": evaluator_scores,
+                },
+                "completionMetrics": {"duration": int(execution_time)},
+                "assertionRuns": assertion_runs,
+            },
+            headers=self._tenant_header(),
+        )
+
+    def _create_eval_run_spec(
+        self, eval_item: EvaluationItem, eval_set_run_id: str
+    ) -> RequestSpec:
+        return RequestSpec(
+            method="POST",
+            endpoint=Endpoint(
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
+            ),
+            json={
+                "evalSetRunId": eval_set_run_id,
+                "evalSnapshot": {
+                    "id": eval_item.id,
+                    "name": eval_item.name,
+                    "inputs": eval_item.inputs,
+                    "expectedOutput": eval_item.expected_output,
+                },
+                "status": EvaluationStatus.IN_PROGRESS.value,
+            },
+            headers=self._tenant_header(),
+        )
+
+    def _create_eval_set_run_spec(
+        self,
+        eval_set_id: str,
+        agent_snapshot: StudioWebAgentSnapshot,
+        no_of_evals: int,
+    ) -> RequestSpec:
+        return RequestSpec(
+            method="POST",
+            endpoint=Endpoint(
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
+            ),
+            json={
+                "agentId": self._project_id,
+                "evalSetId": eval_set_id,
+                "agentSnapshot": agent_snapshot.model_dump(by_alias=True),
+                "status": EvaluationStatus.IN_PROGRESS.value,
+                "numberOfEvalsExecuted": no_of_evals,
+            },
+            headers=self._tenant_header(),
+        )
+
+    def _update_eval_set_run_spec(
+        self,
+        eval_set_run_id: str,
+        evaluator_scores: dict[str, float],
+    ) -> RequestSpec:
+        evaluator_scores_list = [
+            {"value": avg_score, "evaluatorId": evaluator_id}
+            for evaluator_id, avg_score in evaluator_scores.items()
+        ]
+
+        return RequestSpec(
+            method="PUT",
+            endpoint=Endpoint(
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
+            ),
+            json={
+                "evalSetRunId": eval_set_run_id,
+                "status": EvaluationStatus.COMPLETED.value,
+                "evaluatorScores": evaluator_scores_list,
+            },
+            headers=self._tenant_header(),
+        )
+
+    def _tenant_header(self) -> dict[str, str]:
+        tenant_id = os.getenv(ENV_TENANT_ID, None)
+        if not tenant_id:
+            self._console.error(
+                f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'."
+            )
+        return {HEADER_INTERNAL_TENANT_ID: tenant_id}  # type: ignore
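
The gracefully_handle_errors decorator above swallows reporting failures so an evaluation run never dies because StudioWeb is unreachable. A stripped-down, self-contained sketch of the same pattern, using hypothetical names and only the standard library:

import asyncio
import functools
import logging

logger = logging.getLogger(__name__)


def swallow_errors(func):  # same shape as gracefully_handle_errors above
    @functools.wraps(func)
    async def wrapper(self, *args, **kwargs):
        try:
            return await func(self, *args, **kwargs)
        except Exception as e:
            # Log and return None instead of propagating, so the caller keeps going.
            logger.warning("Reporting failed in %s: %s", func.__name__, e)
            return None

    return wrapper


class Reporter:
    @swallow_errors
    async def report(self) -> str:
        raise RuntimeError("StudioWeb unreachable")


print(asyncio.run(Reporter().report()))  # prints None; a warning is logged instead of raising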
@@ -7,9 +7,17 @@ from typing import Any, Dict, Generic, List, Optional, Sequence, TypeVar
 from opentelemetry.sdk.trace import ReadableSpan
 from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
 
+from ..._events._event_bus import EventBus
+from ..._events._events import (
+    EvalRunCreatedEvent,
+    EvalRunUpdatedEvent,
+    EvalSetRunCreatedEvent,
+    EvalSetRunUpdatedEvent,
+    EvaluationEvents,
+)
 from ...eval.evaluators import BaseEvaluator
 from ...eval.models import EvaluationResult
-from ...eval.models.models import AgentExecution
+from ...eval.models.models import AgentExecution, EvalItemResult
 from .._runtime._contracts import (
     UiPathBaseRuntime,
     UiPathRuntimeContext,
@@ -75,10 +83,16 @@ class UiPathEvalContext(UiPathRuntimeContext):
 class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
     """Specialized runtime for evaluation runs, with access to the factory."""
 
-    def __init__(self, context: UiPathEvalContext, factory: UiPathRuntimeFactory[T, C]):
+    def __init__(
+        self,
+        context: UiPathEvalContext,
+        factory: UiPathRuntimeFactory[T, C],
+        event_bus: EventBus,
+    ):
         super().__init__(context)
         self.context: UiPathEvalContext = context
         self.factory: UiPathRuntimeFactory[T, C] = factory
+        self.event_bus: EventBus = event_bus
         self.span_exporter: ExecutionSpanExporter = ExecutionSpanExporter()
         self.factory.add_span_exporter(self.span_exporter)
 
@@ -87,50 +101,119 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
         cls,
         context: UiPathEvalContext,
         factory: UiPathRuntimeFactory[T, C],
+        event_bus: EventBus,
     ) -> "UiPathEvalRuntime[T, C]":
-        return cls(context, factory)
+        return cls(context, factory, event_bus)
 
     async def execute(self) -> Optional[UiPathRuntimeResult]:
-        """Evaluation logic. Can spawn other runtimes through the factory."""
         if self.context.eval_set is None:
             raise ValueError("eval_set must be provided for evaluation runs")
 
+        if not self.context.execution_id:
+            raise ValueError("execution_id must be provided for evaluation runs")
+
+        event_bus = self.event_bus
+
         evaluation_set = EvalHelpers.load_eval_set(
             self.context.eval_set, self.context.eval_ids
         )
         evaluators = self._load_evaluators(evaluation_set)
+
+        evaluator_averages = {evaluator.id: 0.0 for evaluator in evaluators}
+        evaluator_counts = {evaluator.id: 0 for evaluator in evaluators}
+
+        await event_bus.publish(
+            EvaluationEvents.CREATE_EVAL_SET_RUN,
+            EvalSetRunCreatedEvent(
+                execution_id=self.context.execution_id,
+                entrypoint=self.context.entrypoint or "",
+                eval_set_id=evaluation_set.id,
+                no_of_evals=len(evaluation_set.evaluations),
+                evaluators=evaluators,
+            ),
+        )
+
         results = UiPathEvalOutput(
             evaluation_set_name=evaluation_set.name, score=0, evaluation_set_results=[]
         )
         for eval_item in evaluation_set.evaluations:
+            await event_bus.publish(
+                EvaluationEvents.CREATE_EVAL_RUN,
+                EvalRunCreatedEvent(
+                    execution_id=self.context.execution_id,
+                    eval_item=eval_item,
+                ),
+            )
+
             evaluation_run_results = EvaluationRunResult(
                 evaluation_name=eval_item.name, evaluation_run_results=[]
            )
 
             results.evaluation_set_results.append(evaluation_run_results)
+
             agent_execution_output = await self.execute_runtime(eval_item)
-            # we run each evaluator on the agent_output
+            evaluation_item_results: list[EvalItemResult] = []
+
             for evaluator in evaluators:
                 evaluation_result = await self.run_evaluator(
                     evaluator=evaluator,
                     execution_output=agent_execution_output,
                     eval_item=eval_item,
                 )
+
+                dto_result = EvaluationResultDto.from_evaluation_result(
+                    evaluation_result
+                )
+                evaluator_counts[evaluator.id] += 1
+                count = evaluator_counts[evaluator.id]
+                evaluator_averages[evaluator.id] += (
+                    dto_result.score - evaluator_averages[evaluator.id]
+                ) / count
+
                 evaluation_run_results.evaluation_run_results.append(
                     EvaluationRunResultDto(
                         evaluator_name=evaluator.name,
-                        result=EvaluationResultDto.from_evaluation_result(
-                            evaluation_result
-                        ),
+                        result=dto_result,
+                    )
+                )
+                evaluation_item_results.append(
+                    EvalItemResult(
+                        evaluator_id=evaluator.id,
+                        result=evaluation_result,
                     )
                 )
 
+            evaluation_run_results.compute_average_score()
+
+            await event_bus.publish(
+                EvaluationEvents.UPDATE_EVAL_RUN,
+                EvalRunUpdatedEvent(
+                    execution_id=self.context.execution_id,
+                    eval_item=eval_item,
+                    eval_results=evaluation_item_results,
+                    success=not agent_execution_output.result.error,
+                    agent_output=agent_execution_output.result.output,
+                    agent_execution_time=agent_execution_output.execution_time,
+                    spans=agent_execution_output.spans,
+                ),
+                wait_for_completion=False,
+            )
+
         results.compute_average_score()
+
+        await event_bus.publish(
+            EvaluationEvents.UPDATE_EVAL_SET_RUN,
+            EvalSetRunUpdatedEvent(
+                execution_id=self.context.execution_id,
+                evaluator_scores=evaluator_averages,
+            ),
+            wait_for_completion=False,
+        )
+
         self.context.result = UiPathRuntimeResult(
            output={**results.model_dump(by_alias=True)},
             status=UiPathRuntimeStatus.SUCCESSFUL,
         )
-
         return self.context.result
 
     async def execute_runtime(
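
The per-evaluator running average above uses the incremental form avg += (x - avg) / n, which avoids keeping every score around. A quick standalone check (toy numbers) that this matches the plain arithmetic mean:

scores = [70.0, 90.0, 100.0, 40.0]

avg = 0.0
for n, score in enumerate(scores, start=1):
    # Same update as evaluator_averages[...] += (dto_result.score - avg) / count
    avg += (score - avg) / n

assert abs(avg - sum(scores) / len(scores)) < 1e-9
print(avg)  # 75.0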
@@ -141,11 +224,21 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
             input_json=eval_item.inputs,
             is_eval_run=True,
         )
+        attributes = {
+            "evalId": eval_item.id,
+            "span_type": "eval",
+        }
+        if runtime_context.execution_id:
+            attributes["execution.id"] = runtime_context.execution_id
+
         start_time = time()
+
         result = await self.factory.execute_in_root_span(
-            runtime_context, root_span=eval_item.name
+            runtime_context, root_span=eval_item.name, attributes=attributes
         )
+
         end_time = time()
+
         if runtime_context.execution_id is None:
             raise ValueError("execution_id must be set for eval runs")
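
execute_in_root_span is this package's factory helper; the attributes dict it receives presumably ends up as attributes on the root span for the eval item. A minimal, illustrative sketch of attaching the same keys with the plain OpenTelemetry API (hypothetical values, not the factory's actual implementation):

from opentelemetry import trace

tracer = trace.get_tracer(__name__)

# Hypothetical values; in the runtime these come from eval_item and runtime_context.
attributes = {
    "evalId": "eval-123",
    "span_type": "eval",
    "execution.id": "exec-456",
}

with tracer.start_as_current_span("my-eval-item", attributes=attributes) as span:
    # The agent execution would run here; the span carries the eval metadata.
    span.set_attribute("custom.note", "illustrative only")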