uipath 2.1.108__py3-none-any.whl → 2.1.110__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of uipath has been flagged as a potentially problematic release.

Files changed (72)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_runtime/_contracts.py +2 -4
  19. uipath/_cli/_runtime/_runtime.py +2 -5
  20. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  21. uipath/_cli/_utils/_eval_set.py +30 -9
  22. uipath/_cli/_utils/_resources.py +21 -0
  23. uipath/_cli/_utils/_studio_project.py +18 -0
  24. uipath/_cli/cli_add.py +114 -0
  25. uipath/_cli/cli_eval.py +5 -1
  26. uipath/_cli/cli_init.py +5 -4
  27. uipath/_cli/cli_pull.py +11 -26
  28. uipath/_cli/cli_push.py +2 -0
  29. uipath/_cli/cli_register.py +45 -0
  30. uipath/_events/_events.py +6 -5
  31. uipath/_utils/constants.py +4 -0
  32. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  33. uipath/eval/_helpers/helpers.py +30 -2
  34. uipath/eval/evaluators/__init__.py +60 -5
  35. uipath/eval/evaluators/base_evaluator.py +546 -44
  36. uipath/eval/evaluators/contains_evaluator.py +80 -0
  37. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  38. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  39. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  40. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  41. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  42. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  43. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  44. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  45. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  46. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  47. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  48. uipath/eval/evaluators/output_evaluator.py +117 -0
  49. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  50. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  51. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  52. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  53. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  54. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  55. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  56. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  57. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  58. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  59. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  60. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  61. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  62. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  63. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  64. uipath/eval/evaluators_types/generate_types.py +31 -0
  65. uipath/eval/models/__init__.py +16 -1
  66. uipath/eval/models/llm_judge_types.py +196 -0
  67. uipath/eval/models/models.py +109 -7
  68. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/METADATA +1 -1
  69. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/RECORD +72 -40
  70. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/WHEEL +0 -0
  71. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/entry_points.txt +0 -0
  72. {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/_evals/_progress_reporter.py

@@ -5,12 +5,18 @@ import json
 import logging
 import os
 from typing import Any, Dict, List
+from urllib.parse import urlparse
 
 from opentelemetry import trace
 from rich.console import Console
 
 from uipath import UiPath
-from uipath._cli._evals._models._evaluation_set import EvaluationItem, EvaluationStatus
+from uipath._cli._evals._models._evaluation_set import (
+    AnyEvaluationItem,
+    AnyEvaluator,
+    EvaluationItem,
+    EvaluationStatus,
+)
 from uipath._cli._evals._models._sw_reporting import (
     StudioWebAgentSnapshot,
     StudioWebProgressItem,
@@ -28,8 +34,12 @@ from uipath._events._events import (
     EvaluationEvents,
 )
 from uipath._utils import Endpoint, RequestSpec
-from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
-from uipath.eval.evaluators import BaseEvaluator
+from uipath._utils.constants import (
+    ENV_EVAL_BACKEND_URL,
+    ENV_TENANT_ID,
+    HEADER_INTERNAL_TENANT_ID,
+)
+from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
 from uipath.eval.models import EvalItemResult, ScoreType
 from uipath.tracing import LlmOpsHttpExporter
 
@@ -65,7 +75,10 @@ class StudioWebProgressReporter:
 
         logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
         console_logger = ConsoleLogger.get_instance()
-        uipath = UiPath()
+
+        # Use UIPATH_EVAL_BACKEND_URL for eval-specific routing if set
+        eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL)
+        uipath = UiPath(base_url=eval_backend_url) if eval_backend_url else UiPath()
 
         self._client = uipath.api_client
         self._console = console_logger
@@ -80,18 +93,128 @@ class StudioWebProgressReporter:
         self.evaluators: Dict[str, Any] = {}
         self.evaluator_scores: Dict[str, List[float]] = {}
         self.eval_run_ids: Dict[str, str] = {}
+        self.is_coded_eval: Dict[str, bool] = {}  # Track coded vs legacy per execution
+        self.eval_spans: Dict[
+            str, list[Any]
+        ] = {}  # Store spans per execution for usage metrics
 
     def _format_error_message(self, error: Exception, context: str) -> None:
         """Helper method to format and display error messages consistently."""
         self._rich_console.print(f" • \u26a0 [dim]{context}: {error}[/dim]")
 
+    def _is_localhost(self) -> bool:
+        """Check if the eval backend URL is localhost.
+
+        Returns:
+            True if using localhost, False otherwise.
+        """
+        eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL, "")
+        if eval_backend_url:
+            try:
+                parsed = urlparse(eval_backend_url)
+                hostname = parsed.hostname or parsed.netloc.split(":")[0]
+                return hostname.lower() in ("localhost", "127.0.0.1")
+            except Exception:
+                pass
+        return False
+
+    def _get_endpoint_prefix(self) -> str:
+        """Determine the endpoint prefix based on environment.
+
+        Checks UIPATH_EVAL_BACKEND_URL environment variable:
+        - If set to localhost/127.0.0.1: returns "api/" (direct API access)
+        - Otherwise: returns "agentsruntime_/api/" (service routing for alpha/prod)
+
+        Returns:
+            "api/" for localhost environments, "agentsruntime_/api/" for alpha/production.
+        """
+        if self._is_localhost():
+            return "api/"
+        return "agentsruntime_/api/"
+
+    def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool:
+        """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).
+
+        Args:
+            evaluators: List of evaluators to check
+
+        Returns:
+            True if using coded evaluators, False for legacy evaluators
+        """
+        if not evaluators:
+            return False
+        # Check the first evaluator type
+        return isinstance(evaluators[0], BaseEvaluator)
+
+    def _extract_usage_from_spans(
+        self, spans: list[Any]
+    ) -> dict[str, int | float | None]:
+        """Extract token usage and cost from OpenTelemetry spans.
+
+        Args:
+            spans: List of ReadableSpan objects from agent execution
+
+        Returns:
+            Dictionary with tokens, completionTokens, promptTokens, and cost
+        """
+        total_tokens = 0
+        completion_tokens = 0
+        prompt_tokens = 0
+        total_cost = 0.0
+
+        for span in spans:
+            try:
+                # Handle both dictionary attributes and string Attributes field
+                attrs = None
+                if hasattr(span, "attributes") and span.attributes:
+                    if isinstance(span.attributes, dict):
+                        attrs = span.attributes
+                    elif isinstance(span.attributes, str):
+                        # Parse JSON string attributes
+                        attrs = json.loads(span.attributes)
+
+                # Also check for Attributes field (capitalized) from backend spans
+                if not attrs and hasattr(span, "Attributes") and span.Attributes:
+                    if isinstance(span.Attributes, str):
+                        attrs = json.loads(span.Attributes)
+                    elif isinstance(span.Attributes, dict):
+                        attrs = span.Attributes
+
+                if attrs:
+                    # Try to get usage from nested usage object (backend format)
+                    if "usage" in attrs and isinstance(attrs["usage"], dict):
+                        usage = attrs["usage"]
+                        prompt_tokens += usage.get("promptTokens", 0)
+                        completion_tokens += usage.get("completionTokens", 0)
+                        total_tokens += usage.get("totalTokens", 0)
+                        # Cost might be in usage or at root level
+                        total_cost += usage.get("cost", 0.0)
+
+                    # Also try OpenTelemetry semantic conventions (SDK format)
+                    prompt_tokens += attrs.get("gen_ai.usage.prompt_tokens", 0)
+                    completion_tokens += attrs.get("gen_ai.usage.completion_tokens", 0)
+                    total_tokens += attrs.get("gen_ai.usage.total_tokens", 0)
+                    total_cost += attrs.get("gen_ai.usage.cost", 0.0)
+                    total_cost += attrs.get("llm.usage.cost", 0.0)
+
+            except (json.JSONDecodeError, AttributeError, TypeError) as e:
+                logger.debug(f"Failed to parse span attributes: {e}")
+                continue
+
+        return {
+            "tokens": total_tokens if total_tokens > 0 else None,
+            "completionTokens": completion_tokens if completion_tokens > 0 else None,
+            "promptTokens": prompt_tokens if prompt_tokens > 0 else None,
+            "cost": total_cost if total_cost > 0 else None,
+        }
+
     @gracefully_handle_errors
     async def create_eval_set_run(
         self,
         eval_set_id: str,
         agent_snapshot: StudioWebAgentSnapshot,
         no_of_evals: int,
-        evaluators: List[BaseEvaluator[Any]],
+        evaluators: List[LegacyBaseEvaluator[Any]],
     ) -> str:
         """Create a new evaluation set run in StudioWeb."""
         spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
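
For context on the routing change above, here is a minimal standalone sketch of the same localhost check; pick_prefix and its argument are illustrative names for this note, not part of the uipath package.

# Hedged sketch: mirrors the _is_localhost/_get_endpoint_prefix logic in the hunk above.
from urllib.parse import urlparse

def pick_prefix(eval_backend_url: str | None) -> str:
    # Localhost backends are called directly under "api/"; anything else goes
    # through the agentsruntime_ service route, matching the methods above.
    host = (urlparse(eval_backend_url).hostname or "") if eval_backend_url else ""
    return "api/" if host.lower() in ("localhost", "127.0.0.1") else "agentsruntime_/api/"

assert pick_prefix("http://localhost:8080") == "api/"
assert pick_prefix("https://cloud.uipath.com") == "agentsruntime_/api/"
assert pick_prefix(None) == "agentsruntime_/api/"
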
@@ -101,13 +224,14 @@ class StudioWebProgressReporter:
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
         )
         eval_set_run_id = json.loads(response.content)["id"]
         return eval_set_run_id
 
     @gracefully_handle_errors
     async def create_eval_run(
-        self, eval_item: EvaluationItem, eval_set_run_id: str
+        self, eval_item: AnyEvaluationItem, eval_set_run_id: str
     ) -> str:
         """Create a new evaluation run in StudioWeb.
 
@@ -125,6 +249,7 @@ class StudioWebProgressReporter:
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
         )
         return json.loads(response.content)["id"]
 
@@ -132,25 +257,53 @@ class StudioWebProgressReporter:
     async def update_eval_run(
         self,
         sw_progress_item: StudioWebProgressItem,
-        evaluators: dict[str, BaseEvaluator[Any]],
+        evaluators: dict[str, AnyEvaluator],
+        is_coded: bool = False,
+        spans: list[Any] | None = None,
     ):
         """Update an evaluation run with results."""
-        assertion_runs, evaluator_scores = self._collect_results(
-            sw_progress_item.eval_results, evaluators
+        coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {}
+        legacy_evaluators: dict[str, LegacyBaseEvaluator[Any]] = {}
+        evaluator_runs: list[dict[str, Any]] = []
+        evaluator_scores: list[dict[str, Any]] = []
+
+        for k, v in evaluators.items():
+            if isinstance(v, BaseEvaluator):
+                coded_evaluators[k] = v
+            elif isinstance(v, LegacyBaseEvaluator):
+                legacy_evaluators[k] = v
+
+        # Use coded evaluator format
+        runs, scores = self._collect_coded_results(
+            sw_progress_item.eval_results, coded_evaluators, spans or []
+        )
+        evaluator_runs.extend(runs)
+        evaluator_scores.extend(scores)
+
+        # Use legacy evaluator format
+        runs, scores = self._collect_results(
+            sw_progress_item.eval_results,
+            legacy_evaluators,
+            spans or [],
         )
+        evaluator_runs.extend(runs)
+        evaluator_scores.extend(scores)
+
         spec = self._update_eval_run_spec(
-            assertion_runs=assertion_runs,
+            assertion_runs=evaluator_runs,
             evaluator_scores=evaluator_scores,
             eval_run_id=sw_progress_item.eval_run_id,
             execution_time=sw_progress_item.agent_execution_time,
             actual_output=sw_progress_item.agent_output,
         )
+
         await self._client.request_async(
             method=spec.method,
             url=spec.endpoint,
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
         )
 
     @gracefully_handle_errors
@@ -167,6 +320,7 @@ class StudioWebProgressReporter:
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
         )
 
     async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None:
@@ -174,6 +328,10 @@ class StudioWebProgressReporter:
             self.evaluators = {eval.id: eval for eval in payload.evaluators}
             self.evaluator_scores = {eval.id: [] for eval in payload.evaluators}
 
+            # Detect if using coded evaluators and store for this execution
+            is_coded = self._is_coded_evaluator(payload.evaluators)
+            self.is_coded_eval[payload.execution_id] = is_coded
+
             eval_set_run_id = await self.create_eval_set_run(
                 eval_set_id=payload.eval_set_id,
                 agent_snapshot=self._extract_agent_snapshot(payload.entrypoint),
@@ -185,7 +343,9 @@ class StudioWebProgressReporter:
             if current_span.is_recording():
                 current_span.set_attribute("eval_set_run_id", eval_set_run_id)
 
-            logger.debug(f"Created eval set run with ID: {eval_set_run_id}")
+            logger.debug(
+                f"Created eval set run with ID: {eval_set_run_id} (coded={is_coded})"
+            )
 
         except Exception as e:
             self._format_error_message(e, "StudioWeb create eval set run error")
@@ -230,6 +390,12 @@ class StudioWebProgressReporter:
 
             eval_run_id = self.eval_run_ids[payload.execution_id]
             if eval_run_id:
+                # Get the is_coded flag for this execution
+                is_coded = self.is_coded_eval.get(payload.execution_id, False)
+
+                # Extract usage metrics from spans
+                self._extract_usage_from_spans(payload.spans)
+
                 await self.update_eval_run(
                     StudioWebProgressItem(
                         eval_run_id=eval_run_id,
@@ -239,9 +405,13 @@ class StudioWebProgressReporter:
                        agent_execution_time=payload.agent_execution_time,
                    ),
                    self.evaluators,
+                   is_coded=is_coded,
+                   spans=payload.spans,
                )
 
-               logger.debug(f"Updated eval run with ID: {eval_run_id}")
+               logger.debug(
+                   f"Updated eval run with ID: {eval_run_id} (coded={is_coded})"
+               )
 
         except Exception as e:
             self._format_error_message(e, "StudioWeb reporting error")
@@ -306,10 +476,15 @@ class StudioWebProgressReporter:
     def _collect_results(
         self,
         eval_results: list[EvalItemResult],
-        evaluators: dict[str, BaseEvaluator[Any]],
+        evaluators: dict[str, LegacyBaseEvaluator[Any]],
+        spans: list[Any],
    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
        assertion_runs: list[dict[str, Any]] = []
        evaluator_scores_list: list[dict[str, Any]] = []
+
+       # Extract usage metrics from spans
+       usage_metrics = self._extract_usage_from_spans(spans)
+
        for eval_result in eval_results:
            evaluator_scores_list.append(
                {
@@ -327,10 +502,10 @@ class StudioWebProgressReporter:
                        "duration": int(eval_result.result.evaluation_time)
                        if eval_result.result.evaluation_time
                        else 0,
-                        "cost": None,
-                        "tokens": 0,
-                        "completionTokens": 0,
-                        "promptTokens": 0,
+                        "cost": usage_metrics["cost"],
+                        "tokens": usage_metrics["tokens"] or 0,
+                        "completionTokens": usage_metrics["completionTokens"] or 0,
+                        "promptTokens": usage_metrics["promptTokens"] or 0,
                    },
                    "assertionSnapshot": {
                        "assertionType": evaluators[
@@ -344,6 +519,55 @@ class StudioWebProgressReporter:
         )
         return assertion_runs, evaluator_scores_list
 
+    def _collect_coded_results(
+        self,
+        eval_results: list[EvalItemResult],
+        evaluators: dict[str, BaseEvaluator[Any, Any, Any]],
+        spans: list[Any],
+    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+        """Collect results for coded evaluators.
+
+        Returns evaluatorRuns and scores in the format expected by coded eval endpoints.
+        """
+        evaluator_runs: list[dict[str, Any]] = []
+        evaluator_scores_list: list[dict[str, Any]] = []
+
+        # Extract usage metrics from spans
+        usage_metrics = self._extract_usage_from_spans(spans)
+
+        for eval_result in eval_results:
+            evaluator_scores_list.append(
+                {
+                    "type": eval_result.result.score_type.value,
+                    "value": eval_result.result.score,
+                    "justification": eval_result.result.details,
+                    "evaluatorId": eval_result.evaluator_id,
+                }
+            )
+            evaluator_runs.append(
+                {
+                    "status": EvaluationStatus.COMPLETED.value,
+                    "evaluatorId": eval_result.evaluator_id,
+                    "result": {
+                        "score": {
+                            "type": eval_result.result.score_type.value,
+                            "value": eval_result.result.score,
+                        },
+                        "justification": eval_result.result.details,
+                    },
+                    "completionMetrics": {
+                        "duration": int(eval_result.result.evaluation_time)
+                        if eval_result.result.evaluation_time
+                        else 0,
+                        "cost": usage_metrics["cost"],
+                        "tokens": usage_metrics["tokens"] or 0,
+                        "completionTokens": usage_metrics["completionTokens"] or 0,
+                        "promptTokens": usage_metrics["promptTokens"] or 0,
+                    },
+                }
+            )
+        return evaluator_runs, evaluator_scores_list
+
     def _update_eval_run_spec(
         self,
         assertion_runs: list[dict[str, Any]],
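
As a rough illustration of what _collect_coded_results emits per evaluation result, the entries look approximately like the following; the concrete values (IDs, enum strings, numbers) are invented for this note, and the real ones come from EvalItemResult and EvaluationStatus.

# Hedged example of one score entry and one evaluator run entry (values are made up).
sample_score = {
    "type": "Number",                      # eval_result.result.score_type.value (assumed value)
    "value": 0.85,                         # eval_result.result.score
    "justification": "3 of 4 fields match",
    "evaluatorId": "exact-match",
}
sample_evaluator_run = {
    "status": "Completed",                 # EvaluationStatus.COMPLETED.value (assumed value)
    "evaluatorId": "exact-match",
    "result": {
        "score": {"type": "Number", "value": 0.85},
        "justification": "3 of 4 fields match",
    },
    "completionMetrics": {
        "duration": 2,                     # int(evaluation_time)
        "cost": None,                      # usage_metrics["cost"] when no span usage was found
        "tokens": 0,
        "completionTokens": 0,
        "promptTokens": 0,
    },
}
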
@@ -355,7 +579,7 @@ class StudioWebProgressReporter:
         return RequestSpec(
             method="PUT",
             endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun"
             ),
             json={
                 "evalRunId": eval_run_id,
@@ -370,22 +594,58 @@ class StudioWebProgressReporter:
             headers=self._tenant_header(),
         )
 
+    def _update_coded_eval_run_spec(
+        self,
+        evaluator_runs: list[dict[str, Any]],
+        evaluator_scores: list[dict[str, Any]],
+        eval_run_id: str,
+        actual_output: dict[str, Any],
+        execution_time: float,
+    ) -> RequestSpec:
+        """Create update spec for coded evaluators."""
+        return RequestSpec(
+            method="PUT",
+            endpoint=Endpoint(
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun"
+            ),
+            json={
+                "evalRunId": eval_run_id,
+                "status": EvaluationStatus.COMPLETED.value,
+                "result": {
+                    "output": {"content": {**actual_output}},
+                    "scores": evaluator_scores,
+                },
+                "completionMetrics": {"duration": int(execution_time)},
+                "evaluatorRuns": evaluator_runs,
+            },
+            headers=self._tenant_header(),
+        )
+
     def _create_eval_run_spec(
-        self, eval_item: EvaluationItem, eval_set_run_id: str
+        self, eval_item: AnyEvaluationItem, eval_set_run_id: str
     ) -> RequestSpec:
+        # Build eval snapshot based on evaluation item type
+        eval_snapshot = {
+            "id": eval_item.id,
+            "name": eval_item.name,
+            "inputs": eval_item.inputs,
+        }
+
+        # For new coded evaluators (EvaluationItem), use evaluationCriterias
+        # For legacy evaluators (LegacyEvaluationItem), use expectedOutput
+        if isinstance(eval_item, EvaluationItem):
+            eval_snapshot["evaluationCriterias"] = eval_item.evaluation_criterias
+        else:
+            eval_snapshot["expectedOutput"] = eval_item.expected_output
+
         return RequestSpec(
             method="POST",
             endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun"
             ),
             json={
                 "evalSetRunId": eval_set_run_id,
-                "evalSnapshot": {
-                    "id": eval_item.id,
-                    "name": eval_item.name,
-                    "inputs": eval_item.inputs,
-                    "expectedOutput": eval_item.expected_output,
-                },
+                "evalSnapshot": eval_snapshot,
                 "status": EvaluationStatus.IN_PROGRESS.value,
             },
             headers=self._tenant_header(),
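
To make the evalSnapshot branching above concrete: a coded EvaluationItem carries evaluationCriterias, while a legacy item keeps expectedOutput. The sample below uses invented field values; only the key names mirror the diff.

# Hedged illustration of the two snapshot shapes built by _create_eval_run_spec.
coded_snapshot = {
    "id": "eval-1",
    "name": "refund flow",
    "inputs": {"ticket": "Order arrived damaged"},
    "evaluationCriterias": [...],          # eval_item.evaluation_criterias (structure not shown in this diff)
}
legacy_snapshot = {
    "id": "eval-1",
    "name": "refund flow",
    "inputs": {"ticket": "Order arrived damaged"},
    "expectedOutput": {"resolution": "refund"},
}
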
@@ -400,7 +660,7 @@ class StudioWebProgressReporter:
         return RequestSpec(
             method="POST",
             endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalSetRun"
             ),
             json={
                 "agentId": self._project_id,
@@ -425,7 +685,7 @@ class StudioWebProgressReporter:
         return RequestSpec(
             method="PUT",
             endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalSetRun"
             ),
             json={
                 "evalSetRunId": eval_set_run_id,