uipath 2.1.42__py3-none-any.whl → 2.1.43__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their public registries.
- uipath/_cli/_evals/_models/_agent_execution_output.py +14 -0
- uipath/_cli/_evals/_runtime.py +172 -0
- uipath/_cli/_runtime/_contracts.py +135 -6
- uipath/_cli/_utils/_eval_set.py +84 -0
- uipath/_cli/cli_eval.py +73 -42
- uipath/_cli/cli_run.py +10 -36
- uipath/_cli/middlewares.py +1 -0
- uipath/_utils/constants.py +3 -0
- uipath/eval/_helpers/__init__.py +3 -0
- uipath/eval/_helpers/helpers.py +47 -0
- {uipath-2.1.42.dist-info → uipath-2.1.43.dist-info}/METADATA +1 -1
- {uipath-2.1.42.dist-info → uipath-2.1.43.dist-info}/RECORD +15 -11
- uipath/_cli/_evals/evaluation_service.py +0 -582
- {uipath-2.1.42.dist-info → uipath-2.1.43.dist-info}/WHEEL +0 -0
- {uipath-2.1.42.dist-info → uipath-2.1.43.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.42.dist-info → uipath-2.1.43.dist-info}/licenses/LICENSE +0 -0
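For context on the largest change: in 2.1.42 the EvaluationService (its full source appears in the removed-file diff below) was constructed with optional entrypoint and eval-set paths and then driven through its async run_evaluation() coroutine, presumably from cli_eval.py; in 2.1.43 that flow appears to move into the new uipath/_cli/_evals/_runtime.py. The snippet below is a minimal, illustrative sketch of the pre-2.1.43 usage, based only on the signatures shown in the diff and assuming a project directory that already contains uipath.json and a single evals/eval-sets/*.json file:

# Illustrative sketch only (uipath 2.1.42 and earlier); this module is removed in 2.1.43.
import asyncio

from uipath._cli._evals.evaluation_service import EvaluationService

service = EvaluationService(
    entrypoint=None,        # auto-discovered from uipath.json when omitted
    eval_set_path=None,     # auto-discovered from evals/eval-sets/*.json when omitted
    workers=8,              # number of parallel consumer tasks
    report_progress=False,  # skip StudioWeb progress reporting for a local-only run
)
asyncio.run(service.run_evaluation())

Per _create_and_initialize_results_file in the diff, results were written to evals/results/eval-&lt;set-name&gt;-&lt;timestamp&gt;.json next to the eval set.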
uipath/_cli/_evals/evaluation_service.py
@@ -1,582 +0,0 @@
-"""Evaluation service for running and managing evaluation sets."""
-
-import asyncio
-import json
-import os
-import tempfile
-import warnings
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import click
-
-from uipath._cli._utils._console import ConsoleLogger, EvaluationProgressManager
-
-from ..cli_run import run_core  # type: ignore
-from ._evaluators._evaluator_base import EvaluatorBase
-from ._evaluators._evaluator_factory import EvaluatorFactory
-from ._models import (
-    EvaluationSet,
-    EvaluationSetResult,
-)
-from ._models._evaluators import EvalItemResult
-from .progress_reporter import ProgressReporter
-
-console = ConsoleLogger()
-
-
-class EvaluationService:
-    """Service for running evaluations."""
-
-    def __init__(
-        self,
-        entrypoint: Optional[str] = None,
-        eval_set_path: Optional[str | Path] = None,
-        eval_ids: Optional[List[str]] = None,
-        workers: int = 8,
-        report_progress: bool = True,
-    ):
-        """Initialize the evaluation service.
-
-        Args:
-            entrypoint: Path to the agent script to evaluate (optional, will auto-discover if not provided)
-            eval_set_path: Path to the evaluation set file (optional, will auto-discover if not provided)
-            workers: Number of parallel workers for running evaluations
-            report_progress: Whether to report progress to StudioWeb
-        """
-        self.entrypoint, self.eval_set_path = self._resolve_paths(
-            entrypoint, eval_set_path
-        )
-        self._eval_set = self._load_eval_set(eval_ids)
-        self._evaluators = self._load_evaluators()
-        self._num_workers = workers
-        self._results_lock = asyncio.Lock()
-        self._progress_manager: Optional[EvaluationProgressManager] = None
-        self._report_progress = report_progress
-        self._progress_reporter: Optional[ProgressReporter] = None
-        self._initialize_results()
-
-    def _resolve_paths(
-        self, entrypoint: Optional[str], eval_set_path: Optional[str | Path]
-    ) -> tuple[str, Path]:
-        """Resolve entrypoint and eval_set_path, auto-discovering if not provided.
-
-        Args:
-            entrypoint: Optional entrypoint path
-            eval_set_path: Optional eval set path
-
-        Returns:
-            Tuple of (resolved_entrypoint, resolved_eval_set_path)
-
-        Raises:
-            ValueError: If paths cannot be resolved or multiple options exist
-        """
-        resolved_entrypoint = entrypoint
-        resolved_eval_set_path = eval_set_path
-
-        if resolved_entrypoint is None:
-            resolved_entrypoint = self._auto_discover_entrypoint()
-
-        if resolved_eval_set_path is None:
-            resolved_eval_set_path = self._auto_discover_eval_set()
-
-        eval_set_path_obj = Path(resolved_eval_set_path)
-        if not eval_set_path_obj.is_file() or eval_set_path_obj.suffix != ".json":
-            raise ValueError("Evaluation set must be a JSON file")
-
-        return resolved_entrypoint, eval_set_path_obj
-
-    def _auto_discover_entrypoint(self) -> str:
-        """Auto-discover entrypoint from config file.
-
-        Returns:
-            Path to the entrypoint
-
-        Raises:
-            ValueError: If no entrypoint found or multiple entrypoints exist
-        """
-        config_file = "uipath.json"
-        if not os.path.isfile(config_file):
-            raise ValueError(
-                f"File '{config_file}' not found. Please run 'uipath init'."
-            )
-
-        with open(config_file, "r", encoding="utf-8") as f:
-            uipath_config = json.loads(f.read())
-
-        entrypoints = uipath_config.get("entryPoints", [])
-
-        if not entrypoints:
-            raise ValueError(
-                "No entrypoints found in uipath.json. Please run 'uipath init'."
-            )
-
-        if len(entrypoints) > 1:
-            entrypoint_paths = [ep.get("filePath") for ep in entrypoints]
-            raise ValueError(
-                f"Multiple entrypoints found: {entrypoint_paths}. "
-                f"Please specify which entrypoint to use: uipath eval <entrypoint> [eval_set]"
-            )
-
-        entrypoint_path = entrypoints[0].get("filePath")
-
-        console.info(
-            f"Auto-discovered entrypoint: {click.style(entrypoint_path, fg='cyan')}"
-        )
-        return entrypoint_path
-
-    def _auto_discover_eval_set(self) -> str:
-        """Auto-discover evaluation set from evals/eval-sets directory.
-
-        Returns:
-            Path to the evaluation set file
-
-        Raises:
-            ValueError: If no eval set found or multiple eval sets exist
-        """
-        eval_sets_dir = Path("evals/eval-sets")
-
-        if not eval_sets_dir.exists():
-            raise ValueError(
-                "No 'evals/eval-sets' directory found. "
-                "Please set 'UIPATH_PROJECT_ID' env var and run 'uipath pull'."
-            )
-
-        eval_set_files = list(eval_sets_dir.glob("*.json"))
-
-        if not eval_set_files:
-            raise ValueError(
-                "No evaluation set files found in 'evals/eval-sets' directory. "
-            )
-
-        if len(eval_set_files) > 1:
-            file_names = [f.name for f in eval_set_files]
-            raise ValueError(
-                f"Multiple evaluation sets found: {file_names}. "
-                f"Please specify which evaluation set to use: 'uipath eval [entrypoint] <eval_set_path>'"
-            )
-
-        eval_set_path = str(eval_set_files[0])
-        console.info(
-            f"Auto-discovered evaluation set: {click.style(eval_set_path, fg='cyan')}"
-        )
-        return eval_set_path
-
-    def _initialize_results(self) -> None:
-        """Initialize the results file and directory."""
-        self._create_and_initialize_results_file()
-        # Initialize progress reporter if needed
-        if self._report_progress:
-            agent_snapshot = self._extract_agent_snapshot()
-            self._progress_reporter = ProgressReporter(
-                eval_set_id=self._eval_set.id,
-                agent_snapshot=agent_snapshot,
-                no_of_evals=len(self._eval_set.evaluations),
-                evaluators=self._evaluators,
-            )
-
-    def _extract_agent_snapshot(self) -> str:
-        """Extract agent snapshot from uipath.json file.
-
-        Returns:
-            JSON string containing the agent snapshot with input and output schemas
-        """
-        config_file = "uipath.json"
-        if not os.path.isfile(config_file):
-            console.error(f"File '{config_file}' not found. Please run 'uipath init'")
-
-        with open(config_file, "r", encoding="utf-8") as f:
-            file_content = f.read()
-            uipath_config = json.loads(file_content)
-
-        entry_point = None
-        for ep in uipath_config.get("entryPoints", []):
-            if ep.get("filePath") == self.entrypoint:
-                entry_point = ep
-                break
-
-        if not entry_point:
-            console.error(
-                f"No entry point found with filePath '{self.entrypoint}' in uipath.json"
-            )
-
-        input_schema = entry_point.get("input", {})  # type: ignore
-        output_schema = entry_point.get("output", {})  # type: ignore
-
-        # Format as agent snapshot
-        agent_snapshot = {"inputSchema": input_schema, "outputSchema": output_schema}
-
-        return json.dumps(agent_snapshot)
-
-    def _create_and_initialize_results_file(self):
-        # Create results directory if it doesn't exist
-        results_dir = self.eval_set_path.parent.parent / "results"
-        results_dir.mkdir(exist_ok=True)
-
-        # Create results file
-        timestamp = datetime.now(timezone.utc).strftime("%M-%H-%d-%m-%Y")
-        eval_set_name = self._eval_set.name
-        self.result_file = results_dir / f"eval-{eval_set_name}-{timestamp}.json"
-
-        initial_results = EvaluationSetResult(
-            eval_set_id=self._eval_set.id,
-            eval_set_name=self._eval_set.name,
-            results=[],
-            average_score=0.0,
-        )
-
-        with open(self.result_file, "w", encoding="utf-8") as f:
-            f.write(initial_results.model_dump_json(indent=2))
-
-    def _load_eval_set(self, eval_ids: Optional[List[str]] = None) -> EvaluationSet:
-        """Load the evaluation set from file.
-
-        Returns:
-            The loaded evaluation set as EvaluationSet model
-        """
-        with open(self.eval_set_path, "r", encoding="utf-8") as f:
-            data = json.load(f)
-            eval_set = EvaluationSet(**data)
-            if eval_ids:
-                eval_set.extract_selected_evals(eval_ids)
-            return eval_set
-
-    def _load_evaluators(self) -> List[EvaluatorBase]:
-        """Load evaluators referenced by the evaluation set."""
-        evaluators = []
-        evaluators_dir = self.eval_set_path.parent.parent / "evaluators"
-        evaluator_refs = set(self._eval_set.evaluatorRefs)
-        found_evaluator_ids = set()
-
-        # Load evaluators from JSON files
-        for file in evaluators_dir.glob("*.json"):
-            with open(file, "r", encoding="utf-8") as f:
-                data = json.load(f)
-                evaluator_id = data.get("id")
-
-                if evaluator_id in evaluator_refs:
-                    evaluator = EvaluatorFactory.create_evaluator(data)
-                    evaluators.append(evaluator)
-                    found_evaluator_ids.add(evaluator_id)
-
-        # Check if all referenced evaluators were found
-        missing_evaluators = evaluator_refs - found_evaluator_ids
-        if missing_evaluators:
-            raise ValueError(
-                f"Could not find evaluators with IDs: {missing_evaluators}"
-            )
-
-        return evaluators
-
-    async def _write_results(self, results: List[Any]) -> None:
-        """Write evaluation results to file with async lock.
-
-        Args:
-            results: List of evaluation results to write
-        """
-        async with self._results_lock:
-            # Read current results
-            with open(self.result_file, "r", encoding="utf-8") as f:
-                current_results = EvaluationSetResult.model_validate_json(f.read())
-
-            # Add new results
-            current_results.results.extend(results)
-
-            if current_results.results:
-                current_results.average_score = sum(
-                    r.score for r in current_results.results
-                ) / len(current_results.results)
-
-            # Write updated results
-            with open(self.result_file, "w", encoding="utf-8") as f:
-                f.write(current_results.model_dump_json(indent=2))
-
-    async def _results_queue_consumer(self, results_queue: asyncio.Queue[Any]) -> None:
-        """Consumer task for the results queue that writes to local file.
-
-        Args:
-            results_queue: Queue containing evaluation results to write to file
-        """
-        while True:
-            results: list[EvalItemResult] = await results_queue.get()
-            if results is None:
-                # Sentinel value - consumer should stop
-                results_queue.task_done()
-                return
-
-            try:
-                await self._write_results([eval_item.result for eval_item in results])
-                results_queue.task_done()
-            except Exception as e:
-                console.warning(f"Error writing results to file: {str(e)}")
-                results_queue.task_done()
-
-    async def _sw_progress_reporter_queue_consumer(
-        self, sw_progress_reporter_queue: asyncio.Queue[Any]
-    ) -> None:
-        """Consumer task for the SW progress reporter.
-
-        Args:
-            sw_progress_reporter_queue: Queue containing evaluation results to report to StudioWeb
-        """
-        while True:
-            queue_item = await sw_progress_reporter_queue.get()
-            if queue_item is None:
-                # Sentinel value - consumer should stop
-                sw_progress_reporter_queue.task_done()
-                return
-            eval_run_id: str
-            eval_results: list[EvalItemResult]
-            success: bool
-            execution_time: float
-
-            eval_run_id, eval_results, success, execution_time = queue_item
-
-            try:
-                if self._progress_reporter:
-                    await self._progress_reporter.update_eval_run(
-                        eval_results, eval_run_id, execution_time
-                    )
-                sw_progress_reporter_queue.task_done()
-            except Exception as e:
-                console.warning(f"Error reporting progress to StudioWeb: {str(e)}")
-                sw_progress_reporter_queue.task_done()
-
-    def _run_agent(self, input_json: str) -> tuple[Dict[str, Any], bool, float]:
-        """Run the agent with the given input.
-
-        Args:
-            input_json: JSON string containing input data
-
-        Returns:
-            Agent output as dictionary and success status
-        """
-        with tempfile.TemporaryDirectory() as tmpdir:
-            try:
-                import time
-
-                output_file = Path(tmpdir) / "output.json"
-                logs_file = Path(tmpdir) / "execution.log"
-
-                # Suppress LangChain deprecation warnings during agent execution
-                with warnings.catch_warnings():
-                    warnings.filterwarnings(
-                        "ignore", category=UserWarning, module="langchain"
-                    )
-                    # Note: Progress reporting is handled outside this method since it's async
-                    start_time = time.time()
-                    success, error_message, info_message = run_core(
-                        entrypoint=self.entrypoint,
-                        input=input_json,
-                        resume=False,
-                        input_file=None,
-                        execution_output_file=output_file,
-                        logs_file=logs_file,
-                        runtime_dir=tmpdir,
-                        is_eval_run=True,
-                    )
-                    execution_time = time.time() - start_time
-                    if not success:
-                        console.warning(error_message)
-                        return {}, False, execution_time
-                    else:
-                        # Read the output file
-                        with open(output_file, "r", encoding="utf-8") as f:
-                            result = json.load(f)
-
-                        # uncomment the following lines to have access to the execution.logs (needed for some types of evals)
-                        # with open(logs_file, "r", encoding="utf-8") as f:
-                        #     logs = f.read()
-                        if isinstance(result, str):
-                            try:
-                                return json.loads(result), True, execution_time
-                            except json.JSONDecodeError as e:
-                                raise Exception(f"Error parsing output: {e}") from e
-                        return result, True, 0.0
-
-            except Exception as e:
-                console.warning(f"Error running agent: {str(e)}")
-                return {"error": str(e)}, False, execution_time
-
-    async def _process_evaluation(
-        self,
-        eval_item: Dict[str, Any],
-        results_queue: asyncio.Queue[Any],
-        sw_progress_reporter_queue: asyncio.Queue[Any],
-    ) -> None:
-        """Process a single evaluation item.
-
-        Args:
-            eval_item: The evaluation item to process
-            results_queue: Queue for local file results
-            sw_progress_reporter_queue: Queue for StudioWeb progress reporting
-        """
-        eval_id = eval_item["id"]
-        eval_run_id: Optional[str] = None
-
-        try:
-            input_json = json.dumps(eval_item["inputs"])
-
-            if self._report_progress and self._progress_reporter:
-                eval_run_id = await self._progress_reporter.create_eval_run(eval_item)
-
-            loop = asyncio.get_running_loop()
-            actual_output, success, execution_time = await loop.run_in_executor(
-                None,
-                self._run_agent,
-                input_json,
-            )
-
-            if success:
-                # Run each evaluator
-                eval_results: list[EvalItemResult] = []
-                for evaluator in self._evaluators:
-                    result = await evaluator.evaluate(
-                        evaluation_id=eval_item["id"],
-                        evaluation_name=eval_item["name"],
-                        input_data=eval_item["inputs"],
-                        expected_output=eval_item["expectedOutput"],
-                        actual_output=actual_output,
-                    )
-                    eval_results.append(
-                        EvalItemResult(evaluator_id=evaluator.id, result=result)
-                    )
-
-                await results_queue.put(eval_results)
-                if self._report_progress:
-                    # TODO: modify this, here we are only reporting for success
-                    await sw_progress_reporter_queue.put(
-                        (eval_run_id, eval_results, success, execution_time)
-                    )
-
-                # Update progress to completed
-                if self._progress_manager:
-                    self._progress_manager.complete_evaluation(eval_id)
-            else:
-                # Mark as failed if agent execution failed
-                if self._progress_manager:
-                    self._progress_manager.fail_evaluation(
-                        eval_id, "Agent execution failed"
-                    )
-
-        except Exception as e:
-            # Mark as failed with error message
-            if self._progress_manager:
-                self._progress_manager.fail_evaluation(eval_id, str(e))
-            raise
-
-    async def _producer_task(self, task_queue: asyncio.Queue[Any]) -> None:
-        """Producer task that adds all evaluations to the queue.
-
-        Args:
-            task_queue: The asyncio queue to add tasks to
-        """
-        for eval_item in self._eval_set.evaluations:
-            await task_queue.put(eval_item.model_dump())
-
-        # Add sentinel values to signal workers to stop
-        for _ in range(self._num_workers):
-            await task_queue.put(None)
-
-    async def _consumer_task(
-        self,
-        task_queue: asyncio.Queue[Any],
-        worker_id: int,
-        results_queue: asyncio.Queue[Any],
-        sw_progress_reporter_queue: asyncio.Queue[Any],
-    ) -> None:
-        """Consumer task that processes evaluations from the queue.
-
-        Args:
-            task_queue: The asyncio queue to get tasks from
-            worker_id: ID of this worker for logging
-            results_queue: Queue for local file results
-            sw_progress_reporter_queue: Queue for StudioWeb progress reporting
-        """
-        while True:
-            eval_item = await task_queue.get()
-            if eval_item is None:
-                # Sentinel value - worker should stop
-                task_queue.task_done()
-                return
-
-            try:
-                await self._process_evaluation(
-                    eval_item, results_queue, sw_progress_reporter_queue
-                )
-                task_queue.task_done()
-            except Exception as e:
-                # Log error and continue to next item
-                task_queue.task_done()
-                console.warning(
-                    f"Evaluation {eval_item.get('name', 'Unknown')} failed: {str(e)}"
-                )

-    async def run_evaluation(self) -> None:
-        """Run the evaluation set using multiple worker tasks."""
-        console.info(
-            f"Starting evaluating {click.style(self._eval_set.name, fg='cyan')} evaluation set..."
-        )
-
-        if self._report_progress and self._progress_reporter:
-            await self._progress_reporter.create_eval_set_run()
-
-        # Prepare items for progress tracker
-        progress_items = [
-            {"id": eval_item.id, "name": eval_item.name}
-            for eval_item in self._eval_set.evaluations
-        ]
-
-        with console.evaluation_progress(progress_items) as progress_manager:
-            self._progress_manager = progress_manager
-
-            task_queue: asyncio.Queue[Any] = asyncio.Queue()
-            results_queue: asyncio.Queue[Any] = asyncio.Queue()
-            sw_progress_reporter_queue: asyncio.Queue[Any] = asyncio.Queue()
-
-            producer = asyncio.create_task(self._producer_task(task_queue))
-
-            consumers = []
-            for worker_id in range(self._num_workers):
-                consumer = asyncio.create_task(
-                    self._consumer_task(
-                        task_queue, worker_id, results_queue, sw_progress_reporter_queue
-                    )
-                )
-                consumers.append(consumer)
-
-            # Create results queue consumer
-            results_consumer = asyncio.create_task(
-                self._results_queue_consumer(results_queue)
-            )
-
-            # Create SW progress reporter queue consumer
-            sw_progress_consumer = None
-            if self._report_progress:
-                sw_progress_consumer = asyncio.create_task(
-                    self._sw_progress_reporter_queue_consumer(
-                        sw_progress_reporter_queue
-                    )
-                )
-
-            # Wait for producer to finish
-            await producer
-            await task_queue.join()
-
-            # Wait for all consumers to finish
-            await asyncio.gather(*consumers)
-
-            # Signal queue consumers to stop by sending sentinel values
-            await results_queue.put(None)
-            if self._report_progress:
-                await sw_progress_reporter_queue.put(None)
-
-            await results_consumer
-            if sw_progress_consumer:
-                await sw_progress_consumer
-
-            if self._progress_reporter:
-                await self._progress_reporter.update_eval_set_run()
-
-        console.info(f"Results saved to {click.style(self.result_file, fg='cyan')}")

File without changes: {uipath-2.1.42.dist-info → uipath-2.1.43.dist-info}/WHEEL
File without changes: {uipath-2.1.42.dist-info → uipath-2.1.43.dist-info}/entry_points.txt
File without changes: {uipath-2.1.42.dist-info → uipath-2.1.43.dist-info}/licenses/LICENSE