uipath 2.1.107__py3-none-any.whl → 2.1.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of uipath might be problematic.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_runtime/_contracts.py +25 -5
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_resources/SDK_REFERENCE.md +0 -97
- uipath/_uipath.py +10 -37
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/RECORD +72 -40
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.107.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/_evals/_progress_reporter.py

```diff
@@ -5,12 +5,18 @@ import json
 import logging
 import os
 from typing import Any, Dict, List
+from urllib.parse import urlparse
 
 from opentelemetry import trace
 from rich.console import Console
 
 from uipath import UiPath
-from uipath._cli._evals._models._evaluation_set import
+from uipath._cli._evals._models._evaluation_set import (
+    AnyEvaluationItem,
+    AnyEvaluator,
+    EvaluationItem,
+    EvaluationStatus,
+)
 from uipath._cli._evals._models._sw_reporting import (
     StudioWebAgentSnapshot,
     StudioWebProgressItem,
@@ -28,8 +34,12 @@ from uipath._events._events import (
     EvaluationEvents,
 )
 from uipath._utils import Endpoint, RequestSpec
-from uipath._utils.constants import
-
+from uipath._utils.constants import (
+    ENV_EVAL_BACKEND_URL,
+    ENV_TENANT_ID,
+    HEADER_INTERNAL_TENANT_ID,
+)
+from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
 from uipath.eval.models import EvalItemResult, ScoreType
 from uipath.tracing import LlmOpsHttpExporter
 
@@ -65,7 +75,10 @@ class StudioWebProgressReporter:
 
         logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
         console_logger = ConsoleLogger.get_instance()
-
+
+        # Use UIPATH_EVAL_BACKEND_URL for eval-specific routing if set
+        eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL)
+        uipath = UiPath(base_url=eval_backend_url) if eval_backend_url else UiPath()
 
         self._client = uipath.api_client
         self._console = console_logger
@@ -80,18 +93,128 @@ class StudioWebProgressReporter:
         self.evaluators: Dict[str, Any] = {}
         self.evaluator_scores: Dict[str, List[float]] = {}
         self.eval_run_ids: Dict[str, str] = {}
+        self.is_coded_eval: Dict[str, bool] = {}  # Track coded vs legacy per execution
+        self.eval_spans: Dict[
+            str, list[Any]
+        ] = {}  # Store spans per execution for usage metrics
 
     def _format_error_message(self, error: Exception, context: str) -> None:
         """Helper method to format and display error messages consistently."""
         self._rich_console.print(f" • \u26a0 [dim]{context}: {error}[/dim]")
 
+    def _is_localhost(self) -> bool:
+        """Check if the eval backend URL is localhost.
+
+        Returns:
+            True if using localhost, False otherwise.
+        """
+        eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL, "")
+        if eval_backend_url:
+            try:
+                parsed = urlparse(eval_backend_url)
+                hostname = parsed.hostname or parsed.netloc.split(":")[0]
+                return hostname.lower() in ("localhost", "127.0.0.1")
+            except Exception:
+                pass
+        return False
+
+    def _get_endpoint_prefix(self) -> str:
+        """Determine the endpoint prefix based on environment.
+
+        Checks UIPATH_EVAL_BACKEND_URL environment variable:
+        - If set to localhost/127.0.0.1: returns "api/" (direct API access)
+        - Otherwise: returns "agentsruntime_/api/" (service routing for alpha/prod)
+
+        Returns:
+            "api/" for localhost environments, "agentsruntime_/api/" for alpha/production.
+        """
+        if self._is_localhost():
+            return "api/"
+        return "agentsruntime_/api/"
+
+    def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool:
+        """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).
+
+        Args:
+            evaluators: List of evaluators to check
+
+        Returns:
+            True if using coded evaluators, False for legacy evaluators
+        """
+        if not evaluators:
+            return False
+        # Check the first evaluator type
+        return isinstance(evaluators[0], BaseEvaluator)
+
+    def _extract_usage_from_spans(
+        self, spans: list[Any]
+    ) -> dict[str, int | float | None]:
+        """Extract token usage and cost from OpenTelemetry spans.
+
+        Args:
+            spans: List of ReadableSpan objects from agent execution
+
+        Returns:
+            Dictionary with tokens, completionTokens, promptTokens, and cost
+        """
+        total_tokens = 0
+        completion_tokens = 0
+        prompt_tokens = 0
+        total_cost = 0.0
+
+        for span in spans:
+            try:
+                # Handle both dictionary attributes and string Attributes field
+                attrs = None
+                if hasattr(span, "attributes") and span.attributes:
+                    if isinstance(span.attributes, dict):
+                        attrs = span.attributes
+                    elif isinstance(span.attributes, str):
+                        # Parse JSON string attributes
+                        attrs = json.loads(span.attributes)
+
+                # Also check for Attributes field (capitalized) from backend spans
+                if not attrs and hasattr(span, "Attributes") and span.Attributes:
+                    if isinstance(span.Attributes, str):
+                        attrs = json.loads(span.Attributes)
+                    elif isinstance(span.Attributes, dict):
+                        attrs = span.Attributes
+
+                if attrs:
+                    # Try to get usage from nested usage object (backend format)
+                    if "usage" in attrs and isinstance(attrs["usage"], dict):
+                        usage = attrs["usage"]
+                        prompt_tokens += usage.get("promptTokens", 0)
+                        completion_tokens += usage.get("completionTokens", 0)
+                        total_tokens += usage.get("totalTokens", 0)
+                        # Cost might be in usage or at root level
+                        total_cost += usage.get("cost", 0.0)
+
+                    # Also try OpenTelemetry semantic conventions (SDK format)
+                    prompt_tokens += attrs.get("gen_ai.usage.prompt_tokens", 0)
+                    completion_tokens += attrs.get("gen_ai.usage.completion_tokens", 0)
+                    total_tokens += attrs.get("gen_ai.usage.total_tokens", 0)
+                    total_cost += attrs.get("gen_ai.usage.cost", 0.0)
+                    total_cost += attrs.get("llm.usage.cost", 0.0)
+
+            except (json.JSONDecodeError, AttributeError, TypeError) as e:
+                logger.debug(f"Failed to parse span attributes: {e}")
+                continue
+
+        return {
+            "tokens": total_tokens if total_tokens > 0 else None,
+            "completionTokens": completion_tokens if completion_tokens > 0 else None,
+            "promptTokens": prompt_tokens if prompt_tokens > 0 else None,
+            "cost": total_cost if total_cost > 0 else None,
+        }
+
     @gracefully_handle_errors
     async def create_eval_set_run(
         self,
         eval_set_id: str,
         agent_snapshot: StudioWebAgentSnapshot,
         no_of_evals: int,
-        evaluators: List[
+        evaluators: List[LegacyBaseEvaluator[Any]],
     ) -> str:
         """Create a new evaluation set run in StudioWeb."""
         spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
@@ -101,13 +224,14 @@ class StudioWebProgressReporter:
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
         )
         eval_set_run_id = json.loads(response.content)["id"]
         return eval_set_run_id
 
     @gracefully_handle_errors
     async def create_eval_run(
-        self, eval_item:
+        self, eval_item: AnyEvaluationItem, eval_set_run_id: str
     ) -> str:
         """Create a new evaluation run in StudioWeb.
 
@@ -125,6 +249,7 @@ class StudioWebProgressReporter:
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
         )
         return json.loads(response.content)["id"]
 
@@ -132,25 +257,53 @@ class StudioWebProgressReporter:
     async def update_eval_run(
         self,
         sw_progress_item: StudioWebProgressItem,
-        evaluators: dict[str,
+        evaluators: dict[str, AnyEvaluator],
+        is_coded: bool = False,
+        spans: list[Any] | None = None,
     ):
         """Update an evaluation run with results."""
-
-
+        coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {}
+        legacy_evaluators: dict[str, LegacyBaseEvaluator[Any]] = {}
+        evaluator_runs: list[dict[str, Any]] = []
+        evaluator_scores: list[dict[str, Any]] = []
+
+        for k, v in evaluators.items():
+            if isinstance(v, BaseEvaluator):
+                coded_evaluators[k] = v
+            elif isinstance(v, LegacyBaseEvaluator):
+                legacy_evaluators[k] = v
+
+        # Use coded evaluator format
+        runs, scores = self._collect_coded_results(
+            sw_progress_item.eval_results, coded_evaluators, spans or []
+        )
+        evaluator_runs.extend(runs)
+        evaluator_scores.extend(scores)
+
+        # Use legacy evaluator format
+        runs, scores = self._collect_results(
+            sw_progress_item.eval_results,
+            legacy_evaluators,
+            spans or [],
        )
+        evaluator_runs.extend(runs)
+        evaluator_scores.extend(scores)
+
         spec = self._update_eval_run_spec(
-            assertion_runs=
+            assertion_runs=evaluator_runs,
             evaluator_scores=evaluator_scores,
             eval_run_id=sw_progress_item.eval_run_id,
             execution_time=sw_progress_item.agent_execution_time,
             actual_output=sw_progress_item.agent_output,
         )
+
         await self._client.request_async(
             method=spec.method,
             url=spec.endpoint,
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
         )
 
     @gracefully_handle_errors
@@ -167,6 +320,7 @@ class StudioWebProgressReporter:
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
+            scoped="org" if self._is_localhost() else "tenant",
         )
 
     async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None:
@@ -174,6 +328,10 @@ class StudioWebProgressReporter:
             self.evaluators = {eval.id: eval for eval in payload.evaluators}
             self.evaluator_scores = {eval.id: [] for eval in payload.evaluators}
 
+            # Detect if using coded evaluators and store for this execution
+            is_coded = self._is_coded_evaluator(payload.evaluators)
+            self.is_coded_eval[payload.execution_id] = is_coded
+
            eval_set_run_id = await self.create_eval_set_run(
                 eval_set_id=payload.eval_set_id,
                 agent_snapshot=self._extract_agent_snapshot(payload.entrypoint),
@@ -185,7 +343,9 @@ class StudioWebProgressReporter:
             if current_span.is_recording():
                 current_span.set_attribute("eval_set_run_id", eval_set_run_id)
 
-            logger.debug(
+            logger.debug(
+                f"Created eval set run with ID: {eval_set_run_id} (coded={is_coded})"
+            )
 
         except Exception as e:
             self._format_error_message(e, "StudioWeb create eval set run error")
@@ -230,6 +390,12 @@ class StudioWebProgressReporter:
 
             eval_run_id = self.eval_run_ids[payload.execution_id]
             if eval_run_id:
+                # Get the is_coded flag for this execution
+                is_coded = self.is_coded_eval.get(payload.execution_id, False)
+
+                # Extract usage metrics from spans
+                self._extract_usage_from_spans(payload.spans)
+
                 await self.update_eval_run(
                     StudioWebProgressItem(
                         eval_run_id=eval_run_id,
@@ -239,9 +405,13 @@ class StudioWebProgressReporter:
                         agent_execution_time=payload.agent_execution_time,
                     ),
                     self.evaluators,
+                    is_coded=is_coded,
+                    spans=payload.spans,
                 )
 
-                logger.debug(
+                logger.debug(
+                    f"Updated eval run with ID: {eval_run_id} (coded={is_coded})"
+                )
 
         except Exception as e:
             self._format_error_message(e, "StudioWeb reporting error")
@@ -306,10 +476,15 @@ class StudioWebProgressReporter:
     def _collect_results(
         self,
         eval_results: list[EvalItemResult],
-        evaluators: dict[str,
+        evaluators: dict[str, LegacyBaseEvaluator[Any]],
+        spans: list[Any],
     ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
         assertion_runs: list[dict[str, Any]] = []
         evaluator_scores_list: list[dict[str, Any]] = []
+
+        # Extract usage metrics from spans
+        usage_metrics = self._extract_usage_from_spans(spans)
+
         for eval_result in eval_results:
             evaluator_scores_list.append(
                 {
@@ -327,10 +502,10 @@ class StudioWebProgressReporter:
                         "duration": int(eval_result.result.evaluation_time)
                         if eval_result.result.evaluation_time
                         else 0,
-                        "cost":
-                        "tokens": 0,
-                        "completionTokens": 0,
-                        "promptTokens": 0,
+                        "cost": usage_metrics["cost"],
+                        "tokens": usage_metrics["tokens"] or 0,
+                        "completionTokens": usage_metrics["completionTokens"] or 0,
+                        "promptTokens": usage_metrics["promptTokens"] or 0,
                     },
                     "assertionSnapshot": {
                         "assertionType": evaluators[
@@ -344,6 +519,55 @@ class StudioWebProgressReporter:
             )
         return assertion_runs, evaluator_scores_list
 
+    def _collect_coded_results(
+        self,
+        eval_results: list[EvalItemResult],
+        evaluators: dict[str, BaseEvaluator[Any, Any, Any]],
+        spans: list[Any],
+    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+        """Collect results for coded evaluators.
+
+        Returns evaluatorRuns and scores in the format expected by coded eval endpoints.
+        """
+        evaluator_runs: list[dict[str, Any]] = []
+        evaluator_scores_list: list[dict[str, Any]] = []
+
+        # Extract usage metrics from spans
+        usage_metrics = self._extract_usage_from_spans(spans)
+
+        for eval_result in eval_results:
+            evaluator_scores_list.append(
+                {
+                    "type": eval_result.result.score_type.value,
+                    "value": eval_result.result.score,
+                    "justification": eval_result.result.details,
+                    "evaluatorId": eval_result.evaluator_id,
+                }
+            )
+            evaluator_runs.append(
+                {
+                    "status": EvaluationStatus.COMPLETED.value,
+                    "evaluatorId": eval_result.evaluator_id,
+                    "result": {
+                        "score": {
+                            "type": eval_result.result.score_type.value,
+                            "value": eval_result.result.score,
+                        },
+                        "justification": eval_result.result.details,
+                    },
+                    "completionMetrics": {
+                        "duration": int(eval_result.result.evaluation_time)
+                        if eval_result.result.evaluation_time
+                        else 0,
+                        "cost": usage_metrics["cost"],
+                        "tokens": usage_metrics["tokens"] or 0,
+                        "completionTokens": usage_metrics["completionTokens"] or 0,
+                        "promptTokens": usage_metrics["promptTokens"] or 0,
+                    },
+                }
+            )
+        return evaluator_runs, evaluator_scores_list
+
     def _update_eval_run_spec(
         self,
         assertion_runs: list[dict[str, Any]],
@@ -355,7 +579,7 @@ class StudioWebProgressReporter:
         return RequestSpec(
             method="PUT",
             endpoint=Endpoint(
-                f"
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun"
             ),
             json={
                 "evalRunId": eval_run_id,
@@ -370,22 +594,58 @@ class StudioWebProgressReporter:
             headers=self._tenant_header(),
         )
 
+    def _update_coded_eval_run_spec(
+        self,
+        evaluator_runs: list[dict[str, Any]],
+        evaluator_scores: list[dict[str, Any]],
+        eval_run_id: str,
+        actual_output: dict[str, Any],
+        execution_time: float,
+    ) -> RequestSpec:
+        """Create update spec for coded evaluators."""
+        return RequestSpec(
+            method="PUT",
+            endpoint=Endpoint(
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun"
+            ),
+            json={
+                "evalRunId": eval_run_id,
+                "status": EvaluationStatus.COMPLETED.value,
+                "result": {
+                    "output": {"content": {**actual_output}},
+                    "scores": evaluator_scores,
+                },
+                "completionMetrics": {"duration": int(execution_time)},
+                "evaluatorRuns": evaluator_runs,
+            },
+            headers=self._tenant_header(),
+        )
+
     def _create_eval_run_spec(
-        self, eval_item:
+        self, eval_item: AnyEvaluationItem, eval_set_run_id: str
     ) -> RequestSpec:
+        # Build eval snapshot based on evaluation item type
+        eval_snapshot = {
+            "id": eval_item.id,
+            "name": eval_item.name,
+            "inputs": eval_item.inputs,
+        }
+
+        # For new coded evaluators (EvaluationItem), use evaluationCriterias
+        # For legacy evaluators (LegacyEvaluationItem), use expectedOutput
+        if isinstance(eval_item, EvaluationItem):
+            eval_snapshot["evaluationCriterias"] = eval_item.evaluation_criterias
+        else:
+            eval_snapshot["expectedOutput"] = eval_item.expected_output
+
         return RequestSpec(
             method="POST",
             endpoint=Endpoint(
-                f"
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun"
             ),
             json={
                 "evalSetRunId": eval_set_run_id,
-                "evalSnapshot":
-                    "id": eval_item.id,
-                    "name": eval_item.name,
-                    "inputs": eval_item.inputs,
-                    "expectedOutput": eval_item.expected_output,
-                },
+                "evalSnapshot": eval_snapshot,
                 "status": EvaluationStatus.IN_PROGRESS.value,
             },
             headers=self._tenant_header(),
@@ -400,7 +660,7 @@ class StudioWebProgressReporter:
         return RequestSpec(
             method="POST",
             endpoint=Endpoint(
-                f"
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalSetRun"
             ),
             json={
                 "agentId": self._project_id,
@@ -425,7 +685,7 @@ class StudioWebProgressReporter:
         return RequestSpec(
             method="PUT",
             endpoint=Endpoint(
-                f"
+                f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalSetRun"
            ),
             json={
                 "evalSetRunId": eval_set_run_id,
```