hte-cli 0.2.21__tar.gz → 0.2.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hte_cli-0.2.21 → hte_cli-0.2.23}/PKG-INFO +1 -1
- {hte_cli-0.2.21 → hte_cli-0.2.23}/pyproject.toml +1 -1
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/cli.py +48 -481
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/automated_runner.py +75 -28
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/e2e_test.py +19 -9
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_benchmark_flows.py +3 -4
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_eval_logs.py +61 -21
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_infrastructure.py +3 -3
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_runtime_imports.py +4 -6
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_session_lifecycle.py +0 -1
- {hte_cli-0.2.21 → hte_cli-0.2.23}/uv.lock +1 -1
- {hte_cli-0.2.21 → hte_cli-0.2.23}/.gitignore +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/README.md +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/__init__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/__main__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/api_client.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/config.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/errors.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/events.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/image_utils.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/runner.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/scorers.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/version_check.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/__init__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/__init__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/conftest.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/verify_docker_deps.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/__init__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/conftest.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_image_utils.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_runner.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_scorers.py +0 -0
src/hte_cli/cli.py

@@ -194,7 +194,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
         elif e.status_code == 404:
             console.print("[red]Session not found. Check the session ID and try again.[/red]")
         elif e.status_code == 400 and "paused" in str(e).lower():
-            console.print(
+            console.print(
+                "[yellow]Session is paused. Please resume from the web UI first.[/yellow]"
+            )
         else:
             console.print(f"[red]Error: {e}[/red]")
         sys.exit(1)
@@ -236,16 +238,16 @@ def session_join(ctx, session_id: str, force_setup: bool):
         try:
             files_zip = api.get_session_files(session_id)
             console.print(" [green]✓[/green] Task files downloaded")
-        except APIError
-            console.print(
+        except APIError:
+            console.print(" [dim]○ No task files (optional)[/dim]")
             files_zip = None

     with console.status("[dim]Fetching compose configuration...[/dim]"):
         try:
             compose_yaml = api.get_session_compose(session_id)
             console.print(" [green]✓[/green] Docker compose downloaded")
-        except APIError
-            console.print(
+        except APIError:
+            console.print(" [dim]○ No compose file (optional)[/dim]")
             compose_yaml = None

     console.print()
@@ -258,9 +260,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
             f"[red]Error: {benchmark} requires a Docker sandbox but no compose file was found.[/red]"
         )
         console.print()
-        console.print(
-            f"Please contact support: {SUPPORT_EMAIL}"
-        )
+        console.print(f"Please contact support: {SUPPORT_EMAIL}")
         sys.exit(1)

     # Build assignment dict for runner compatibility
@@ -280,6 +280,14 @@ def session_join(ctx, session_id: str, force_setup: bool):
         },
     }

+    # Send session_started event (records CLI version for debugging)
+    events.session_started(
+        {
+            "cli_version": __version__,
+            "task_id": session_info["task_id"],
+        }
+    )
+
     # Step 3: Run setup (skip if reconnecting without force)
     setup_start_time = time.monotonic()
     images = []
@@ -313,7 +321,10 @@ def session_join(ctx, session_id: str, force_setup: bool):

             # Need to pull - show progress
             last_status = ["connecting..."]
-            with console.status(
+            with console.status(
+                f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
+            ) as status:
+
                 def show_progress(image: str, line: str):
                     # Show docker output directly - includes MB progress from PTY
                     # Lines look like: "abc123: Downloading 360.9MB/4.075GB"
@@ -325,7 +336,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
                     display = f"{layer_id}: {layer_status}"
                     if display != last_status[0]:
                         last_status[0] = display
-                        status.update(
+                        status.update(
+                            f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]"
+                        )

                 success = pull_image_with_progress(img, on_progress=show_progress)

@@ -370,7 +383,13 @@ def session_join(ctx, session_id: str, force_setup: bool):
         console.print()

     # Step 3: Run the task using TaskRunner
-    step_num =
+    step_num = (
+        "3"
+        if (not is_reconnect or force_setup) and images
+        else "2"
+        if (not is_reconnect or force_setup)
+        else "1"
+    )
     console.print(f"[bold]Step {step_num}:[/bold] Starting task environment...")
     console.print("[dim]Launching Docker containers...[/dim]")
     console.print()
@@ -391,7 +410,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
     except KeyboardInterrupt:
         events.docker_stopped(exit_code=130)
         console.print()
-        console.print(
+        console.print(
+            "[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]"
+        )
         sys.exit(0)
     except Exception as e:
         events.docker_stopped(exit_code=1)
@@ -415,10 +436,12 @@ def session_join(ctx, session_id: str, force_setup: bool):
         try:
             from io import BytesIO
             from zipfile import ZipFile
+
             with ZipFile(BytesIO(files_zip)) as zf:
                 if "difficulty_levels.json" in zf.namelist():
                     with zf.open("difficulty_levels.json") as f:
                         import json
+
                         difficulty_info = json.load(f)
                         agent_id = difficulty_info.get("agent_id")
         except Exception:
@@ -429,13 +452,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
     console.print(f"Answer: {result.answer}")
     console.print(f"Time: {result.time_seconds:.1f}s")

+    # Track upload size and timing
+    upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
+    upload_size_kb = upload_size_bytes / 1024
+
+    events.upload_started(size_bytes=upload_size_bytes)
+    upload_start_time = time.monotonic()
+
     # Upload to server
     with Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
         console=console,
     ) as progress:
-
+        size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
+        progress.add_task(f"Uploading result{size_str}...", total=None)
         try:
             upload_result = api.upload_result(
                 session_id=session_id,
@@ -450,6 +481,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
             console.print(f"[red]Failed to upload result: {e}[/red]")
             sys.exit(1)

+    # Record upload completion
+    upload_duration = time.monotonic() - upload_start_time
+    events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
+
     if upload_result.get("score") is not None:
         console.print(f"Score: {upload_result['score']}")

@@ -548,474 +583,6 @@ def tasks_run(ctx, task_id: str | None):
         sys.exit(1)


-# Keep the old implementation as _tasks_run_legacy for testing if needed
-def _tasks_run_legacy(ctx, task_id: str | None):
-    """Legacy implementation of tasks run (for testing only)."""
-    config: Config = ctx.obj["config"]
-
-    if not config.is_authenticated():
-        console.print("[red]Not logged in. Run: hte-cli auth login[/red]")
-        sys.exit(1)
-
-    # Check Docker and Compose version
-    docker_ok, docker_error = _check_docker()
-    if not docker_ok:
-        console.print(f"[red]{docker_error}[/red]")
-        sys.exit(1)
-
-    api = APIClient(config)
-
-    # Get assignments
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        progress.add_task("Fetching assignments...", total=None)
-        try:
-            assignments = api.get_assignments()
-        except APIError as e:
-            console.print(f"[red]Error: {e}[/red]")
-            sys.exit(1)
-
-    if not assignments:
-        console.print("[yellow]No pending assignments[/yellow]")
-        return
-
-    # Find the assignment to run
-    assignment = None
-    if task_id:
-        for a in assignments:
-            if a["task_id"] == task_id:
-                assignment = a
-                break
-        if not assignment:
-            console.print(f"[red]Task not found in your assignments: {task_id}[/red]")
-            sys.exit(1)
-    else:
-        # Take highest priority (first in list, already sorted by server)
-        assignment = assignments[0]
-
-    console.print()
-    console.print(
-        Panel(
-            f"[bold]Task:[/bold] {assignment['task_id']}\n"
-            f"[bold]Benchmark:[/bold] {assignment['benchmark']}\n"
-            f"[bold]Mode:[/bold] {assignment['mode']}",
-            title="Starting Task",
-        )
-    )
-    console.print()
-
-    # Import runner and events
-    from hte_cli.runner import TaskRunner
-    from hte_cli.events import EventStreamer
-
-    # Step 1: Start session (or resume existing)
-    session_id = assignment.get("session_id")
-    if not session_id:
-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            console=console,
-        ) as progress:
-            progress.add_task("Starting session...", total=None)
-            try:
-                session_info = api.start_session(assignment["assignment_id"])
-                session_id = session_info["session_id"]
-            except APIError as e:
-                console.print(f"[red]Failed to start session: {e}[/red]")
-                sys.exit(1)
-    else:
-        console.print(f"[yellow]Resuming session: {session_id}[/yellow]")
-
-    # Create event streamer
-    events = EventStreamer(api, session_id)
-
-    # Step 2: Download task files and compose
-    files_zip = None
-    compose_yaml = None
-
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        # Download files
-        task_id_display = progress.add_task("Downloading task files...", total=None)
-        try:
-            files_zip = api.get_assignment_files(assignment["assignment_id"])
-        except APIError as e:
-            # Files are optional for some benchmarks
-            console.print(f"[dim]Note: {e}[/dim]")
-
-        # Download compose
-        progress.update(task_id_display, description="Downloading compose file...")
-        try:
-            compose_yaml = api.get_assignment_compose(assignment["assignment_id"])
-        except APIError as e:
-            # Compose is optional for simple benchmarks
-            console.print(f"[dim]Note: {e}[/dim]")
-
-    # Validate compose for sandbox-required benchmarks
-    benchmark = assignment.get("benchmark", "").lower()
-    SANDBOX_REQUIRED_BENCHMARKS = {"cybench", "intercode-ctf", "cvebench", "cybergym", "nyuctf"}
-    if benchmark in SANDBOX_REQUIRED_BENCHMARKS and not compose_yaml:
-        console.print(
-            f"[red]Error: This {benchmark} task requires a Docker sandbox but none is available.[/red]"
-        )
-        if benchmark == "cvebench":
-            console.print(
-                "[yellow]This CVE may have infrastructure issues preventing Docker support.[/yellow]"
-            )
-        console.print("[dim]Contact support if you believe this is an error.[/dim]")
-        sys.exit(1)
-
-    # Step 3: Send session_started event
-    events.session_started(
-        {
-            "cli_version": __version__,
-            "task_id": assignment["task_id"],
-        }
-    )
-
-    # Step 4: Show pre-task message (benchmark-specific)
-    single_shot_benchmarks = {"nl2bash", "cybashbench"}
-
-    console.print()
-    if benchmark in single_shot_benchmarks:
-        # Single-shot benchmarks - no task score available
-        console.print(
-            Panel(
-                "[bold]Instructions[/bold]\n\n"
-                "You are about to enter the task environment.\n\n"
-                "[yellow]IMPORTANT: This is a SINGLE-SHOT task.[/yellow]\n"
-                "[yellow]You get ONE submission attempt - no retries![/yellow]\n\n"
-                "Commands available:\n"
-                " [cyan]task status[/cyan] - Show elapsed time\n"
-                ' [cyan]task submit "answer"[/cyan] - Submit your FINAL answer (ends task)\n'
-                " [cyan]task quit[/cyan] - Quit without submitting\n"
-                ' [cyan]task note "text"[/cyan] - Record observations\n',
-                title="Task Environment",
-            )
-        )
-    else:
-        # CTF/sandbox benchmarks - task score available
-        console.print(
-            Panel(
-                "[bold]Instructions[/bold]\n\n"
-                "You are about to enter the task environment.\n\n"
-                "Commands available:\n"
-                " [cyan]task status[/cyan] - Show elapsed time\n"
-                ' [cyan]task score "answer"[/cyan] - CHECK if correct (does NOT end task)\n'
-                ' [cyan]task submit "answer"[/cyan] - Submit FINAL answer (ends task)\n'
-                " [cyan]task quit[/cyan] - Quit without submitting\n"
-                ' [cyan]task note "text"[/cyan] - Record observations\n\n'
-                "[green]TIP: Use 'task score' to verify before submitting![/green]\n",
-                title="Task Environment",
-            )
-        )
-    console.print()
-
-    if not click.confirm("Ready to start?"):
-        console.print("[yellow]Cancelled[/yellow]")
-        return
-
-    # Step 5: Pre-pull Docker images with progress
-    from hte_cli.image_utils import extract_images_from_compose
-    import re
-    import time
-
-    setup_start_time = time.monotonic()
-    images: list[str] = []
-    results: list[tuple[str, bool, str]] = []
-
-    if compose_yaml:
-        images = extract_images_from_compose(compose_yaml)
-    if images:
-        events.setup_started(images)
-        console.print()
-        console.print(f"[bold]Preparing Docker environment ({len(images)} images)...[/bold]")
-
-        # Track layer progress per image: {layer_id: (status, downloaded_mb, total_mb)}
-        image_layers: dict[str, dict[str, tuple[str, float, float]]] = {}
-
-        def parse_size(size_str: str) -> float:
-            """Parse size string like '1.2MB' or '500kB' to MB."""
-            size_str = size_str.strip().upper()
-            if "GB" in size_str:
-                return float(size_str.replace("GB", "").strip()) * 1024
-            elif "MB" in size_str:
-                return float(size_str.replace("MB", "").strip())
-            elif "KB" in size_str:
-                return float(size_str.replace("KB", "").strip()) / 1024
-            elif "B" in size_str:
-                return float(size_str.replace("B", "").strip()) / (1024 * 1024)
-            return 0
-
-        def parse_docker_line(line: str) -> tuple[str | None, str, float, float]:
-            """Parse Docker pull output to extract layer ID, status, and sizes.
-
-            Returns: (layer_id, status, downloaded_mb, total_mb)
-            """
-            # Format: "79f742de2855: Downloading [==>] 1.2MB/50MB"
-            # Or: "79f742de2855: Pull complete"
-            match = re.match(r"([a-f0-9]+): (.+)", line)
-            if not match:
-                return None, "", 0, 0
-
-            layer_id = match.group(1)
-            status_part = match.group(2)
-
-            # Try to extract size info from "Downloading [==>] 1.2MB/50MB"
-            size_match = re.search(r"([\d.]+[kKmMgG]?[bB]?)/([\d.]+[kKmMgG]?[bB])", status_part)
-            if size_match:
-                downloaded = parse_size(size_match.group(1))
-                total = parse_size(size_match.group(2))
-                return layer_id, status_part, downloaded, total
-
-            return layer_id, status_part, 0, 0
-
-        def get_progress_summary(image: str) -> str:
-            """Get a human-readable progress summary for an image with MB counts."""
-            if image not in image_layers or not image_layers[image]:
-                return "connecting..."
-
-            layers = image_layers[image]
-            total_layers = len(layers)
-
-            # Count layers in different states
-            complete = 0
-            downloading = 0
-            waiting = 0
-            total_downloaded_mb = 0
-            total_size_mb = 0
-
-            for status, downloaded, total in layers.values():
-                status_lower = status.lower()
-                if "complete" in status_lower:
-                    complete += 1
-                    total_downloaded_mb += total
-                    total_size_mb += total
-                elif "downloading" in status_lower:
-                    downloading += 1
-                    total_downloaded_mb += downloaded
-                    total_size_mb += total
-                elif "waiting" in status_lower:
-                    waiting += 1
-
-            # Choose the most informative display
-            if complete == total_layers and total_layers > 0:
-                if total_size_mb > 0:
-                    return f"done ({total_size_mb:.0f}MB)"
-                return f"done ({total_layers} layers)"
-            elif total_size_mb > 0:
-                # Show MB progress when available
-                pct = int(100 * total_downloaded_mb / total_size_mb) if total_size_mb > 0 else 0
-                return f"{total_downloaded_mb:.0f}/{total_size_mb:.0f}MB ({pct}%)"
-            elif downloading > 0:
-                return f"downloading ({complete}/{total_layers} done)"
-            elif complete > 0:
-                return f"extracting ({complete}/{total_layers} done)"
-            elif waiting > 0:
-                return f"queued ({total_layers} layers)"
-            else:
-                return f"preparing ({total_layers} layers)"
-
-        def on_image_progress(image: str, line: str):
-            """Track layer-level progress with size info."""
-            if image not in image_layers:
-                image_layers[image] = {}
-
-            layer_id, status, downloaded, total = parse_docker_line(line)
-            if layer_id:
-                image_layers[image][layer_id] = (status, downloaded, total)
-
-        # Process images sequentially with clear output
-        results = []
-        for idx, img in enumerate(images, 1):
-            short_name = img.split("/")[-1] if "/" in img else img
-
-            # Check if cached first
-            from hte_cli.image_utils import check_image_exists_locally, pull_image_with_progress
-
-            if check_image_exists_locally(img):
-                console.print(f" [green]✓[/green] {short_name} [dim](cached)[/dim]")
-                results.append((img, True, "cached"))
-                continue
-
-            # Need to pull - use Rich Status for live updates
-            image_layers[img] = {}
-
-            with console.status(
-                f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
-            ) as status:
-
-                def show_progress(image: str, line: str):
-                    on_image_progress(image, line)
-                    summary = get_progress_summary(image)
-                    status.update(f"[yellow]↓[/yellow] {short_name} [dim]{summary}[/dim]")
-
-                success = pull_image_with_progress(img, on_progress=show_progress)
-
-            # Final status (printed after status context exits)
-            if success:
-                console.print(f" [green]✓[/green] {short_name} [dim](downloaded)[/dim]")
-                results.append((img, True, "pulled"))
-            else:
-                console.print(f" [red]✗[/red] {short_name} [dim](failed)[/dim]")
-                results.append((img, False, "failed"))
-
-        failed_count = sum(1 for _, ok, _ in results if not ok)
-        if failed_count > 0:
-            console.print(
-                f"[yellow]Warning: {failed_count} image(s) failed to pull. "
-                "Task may fail to start.[/yellow]"
-            )
-        console.print()
-
-    # Record image pull timing
-    if images:
-        pull_duration = time.monotonic() - setup_start_time
-        pulled = [img for img, ok, status in results if ok and status == "pulled"]
-        cached = [img for img, ok, status in results if ok and status == "cached"]
-        failed = [img for img, ok, status in results if not ok]
-        events.image_pull_completed(
-            duration_seconds=pull_duration,
-            pulled=pulled,
-            cached=cached,
-            failed=failed,
-        )
-
-    # Step 6: Run Inspect's human_cli
-    runner = TaskRunner()
-    console.print("[bold]Starting task environment...[/bold]")
-    console.print("[dim]Launching Docker containers...[/dim]")
-    console.print()
-
-    events.docker_started()
-
-    # Record total setup time (image pulls + compose up)
-    total_setup = time.monotonic() - setup_start_time
-    events.setup_completed(total_seconds=total_setup)
-
-    eval_log_bytes = None
-    local_eval_path = None
-    try:
-        result = runner.run_from_assignment(
-            assignment=assignment,
-            compose_yaml=compose_yaml,
-            files_zip=files_zip,
-        )
-        # Read eval log BEFORE cleanup (cleanup deletes the temp directory)
-        if result.eval_log_path and result.eval_log_path.exists():
-            eval_log_bytes = result.eval_log_path.read_bytes()
-
-            # Save local copy for safety
-            eval_logs_dir = get_eval_logs_dir()
-            eval_logs_dir.mkdir(parents=True, exist_ok=True)
-            local_eval_path = eval_logs_dir / result.eval_log_path.name
-            local_eval_path.write_bytes(eval_log_bytes)
-    except Exception as e:
-        events.docker_stopped(exit_code=1)
-        console.print(f"[red]Task execution failed: {e}[/red]")
-        sys.exit(1)
-    finally:
-        runner.cleanup()
-
-    events.docker_stopped(exit_code=0)
-
-    # Step 6: Show post-task summary
-    console.print()
-    console.print(
-        Panel(
-            f"[bold]Time spent:[/bold] {result.time_seconds / 60:.1f} minutes\n"
-            f"[bold]Answer:[/bold] {result.answer or '(none)'}\n"
-            f"[bold]Score:[/bold] {result.score if result.score is not None else 'pending'}",
-            title="Task Complete",
-        )
-    )
-
-    # Defensive check: don't upload if task didn't actually run
-    # (catches edge cases where runner returned without proper error)
-    if result.time_seconds == 0.0 and result.answer is None:
-        console.print()
-        console.print("[red]Task did not complete successfully (0 time, no answer).[/red]")
-        console.print("[yellow]Session preserved - run 'hte-cli tasks run' to retry.[/yellow]")
-        sys.exit(1)
-
-    # Step 7: Upload result
-    events.session_completed(
-        elapsed_seconds=result.time_seconds,
-        answer=result.answer,
-    )
-
-    # Extract agent_id from task files for CyberGym post-hoc verification
-    agent_id = None
-    if files_zip:
-        try:
-            with ZipFile(BytesIO(files_zip)) as zf:
-                if "difficulty_levels.json" in zf.namelist():
-                    with zf.open("difficulty_levels.json") as f:
-                        difficulty_info = json.load(f)
-                        agent_id = difficulty_info.get("agent_id")
-        except Exception:
-            pass  # Not a CyberGym task or malformed zip
-
-    # Show upload size info and track timing
-    upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
-    upload_size_kb = upload_size_bytes / 1024
-    if upload_size_kb / 1024 > 50:
-        console.print(f"[yellow]Warning: Large eval log ({upload_size_kb / 1024:.1f} MB)[/yellow]")
-
-    events.upload_started(size_bytes=upload_size_bytes)
-    upload_start_time = time.monotonic()
-
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
-        progress.add_task(f"Uploading result{size_str}...", total=None)
-
-        try:
-            upload_result = api.upload_result(
-                session_id=session_id,
-                answer=result.answer or "",
-                client_active_seconds=result.time_seconds,
-                eval_log_bytes=eval_log_bytes,
-                score=result.score,
-                score_binarized=result.score_binarized,
-                agent_id=agent_id,
-            )
-        except APIError as e:
-            console.print(f"[red]Failed to upload result: {e}[/red]")
-            if local_eval_path:
-                console.print(f"[yellow]Eval log saved locally: {local_eval_path}[/yellow]")
-            console.print("[yellow]Your result was saved locally but not uploaded.[/yellow]")
-            sys.exit(1)
-
-    # Record upload completion
-    upload_duration = time.monotonic() - upload_start_time
-    events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
-
-    console.print()
-    console.print("[green]Result uploaded successfully![/green]")
-
-    # Show local eval log path (quote paths with spaces for easy copy-paste)
-    if local_eval_path:
-        path_str = str(local_eval_path)
-        if " " in path_str:
-            path_str = f'"{path_str}"'
-        console.print(f"[dim]Eval log: {path_str}[/dim]")
-
-    # Show next task if available
-    if upload_result.get("next_assignment_id"):
-        console.print()
-        console.print("Run [bold]hte-cli tasks run[/bold] for the next task.")
-
-
 @tasks.command("pull-images")
 @click.option("--count", "-n", default=5, help="Number of upcoming tasks to pull images for")
 @click.pass_context
tests/e2e/automated_runner.py

@@ -52,6 +52,10 @@ def strip_ansi(text: str) -> str:

 VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")

+# Use the pipx-installed CLI (what experts use), not the local venv version
+# This ensures E2E tests validate the actual published PyPI package
+HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
 # Environment verification commands per benchmark
 # Based on docs/benchmark-environments.md
 # Format: (command, expected_pattern, description)
@@ -114,7 +118,11 @@ ENVIRONMENT_CHECKS = {
         # 32-bit support (for CTF binaries)
         ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
         # Task files location - STRICT
-        (
+        (
+            "test -d /home/ctfplayer/ctf_files && echo 'EXISTS'",
+            "EXISTS",
+            "ctf_files directory exists",
+        ),
     ],
     "cybench": [
         # Working directory: /app (official cybench/cybench:latest image)
@@ -148,22 +156,26 @@ ENVIRONMENT_CHECKS = {
         # Check that files exist in /root (not just empty directory) using test -gt 0
         ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
         # For tasks with subdirs (chunky, etc.), verify structure is mounted
-        (
+        (
+            "test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0",
+            None,
+            "Files under /root (recursive)",
+        ),
     ],
     "cybergym": [
         # Working directory: /root
         ("pwd", "/root", "Working directory is /root"),
         # OS: Kali Linux (same base as cybench)
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # CyberGym-specific: submit.sh
-        ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present
-        ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
+        # CyberGym-specific: submit.sh in home dir for PoC submission
+        ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
+        ("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
         # Required tools - STRICT (same image as cybench)
         ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
         ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
         ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
-        #
-        ("
+        # Python via venv (same as cybench/cybench:latest)
+        ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
         ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
         ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
         # apt-get for additional tools
@@ -203,7 +215,7 @@ TASK_FILE_CHECKS = {
         "primary_knowledge": [],  # No local files - network-based
     },
     "cybergym": {
-        "*": ["/submit.sh"],  # All cybergym tasks need submit.sh
+        "*": ["/root/submit.sh"],  # All cybergym tasks need submit.sh in home
     },
     "nyuctf": {
         "*": ["/home/ctfplayer/ctf_files"],  # Standard location
@@ -223,29 +235,28 @@ def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
     expected = task_files.get(task_id) or task_files.get("*", [])

     for path in expected:
-        checks.append(
-            (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
-        )
+        checks.append((f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists"))

     return checks

+
 # Commands to run for submission tests
 SUBMISSION_TESTS = {
     "intercode-ctf": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
     "nyuctf": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
     "cybench": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
@@ -266,6 +277,12 @@ SUBMISSION_TESTS = {
         "submit_cmd": 'task submit "ls -la"',
         "submit_expect": "submitted",
     },
+    "nl2bash": {
+        # Single-shot - no score command (same as cybashbench)
+        "score_cmd": None,
+        "submit_cmd": 'task submit "ls -la"',
+        "submit_expect": "submitted",
+    },
 }


@@ -374,9 +391,16 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

     # Start the CLI using the new session join flow
     # Session has status="created", so CLI will run full setup
-
+    # Use explicit pipx path to test the published PyPI version, not local dev
+    if not HTE_CLI_PATH.exists():
+        console.print(f"[red]hte-cli not found at {HTE_CLI_PATH}[/red]")
+        console.print("[yellow]Install with: pipx install hte-cli[/yellow]")
+        results.append(TestResult("CLI installed", False, "", f"hte-cli not at {HTE_CLI_PATH}"))
+        return results
+
+    console.print(f"Launching {HTE_CLI_PATH} session join {session_id}...")
     child = pexpect.spawn(
-        f"
+        f"{HTE_CLI_PATH} session join {session_id}",
         encoding="utf-8",
         timeout=timeout,
         env={**os.environ, "TERM": "dumb"},  # Disable colors for easier parsing
@@ -426,6 +450,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     results.append(TestResult("Environment setup", True, "Environment ready"))

     # Wait for the "Login to the system" message and docker exec command
+    # CVE bench builds containers from source, can take 5+ minutes
     console.print("Waiting for docker exec command...")
     idx = child.expect(
         [
@@ -433,7 +458,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
             r"docker exec -it",
             pexpect.TIMEOUT,
         ],
-        timeout=
+        timeout=300,  # 5 minutes for slow builds (cvebench)
     )

     if idx == 2:  # TIMEOUT
@@ -603,15 +628,24 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

     # Test score command if available
     if sub_tests.get("score_cmd"):
+        # Clear buffer before score test to avoid capturing stale output
+        try:
+            docker_child.read_nonblocking(size=10000, timeout=0.5)
+        except Exception:
+            pass
         docker_child.sendline(sub_tests["score_cmd"])
         time.sleep(2)
         docker_child.expect(prompt_patterns[:-1], timeout=30)
         output = strip_ansi(docker_child.before or "")
-
+
         expected_score = sub_tests.get("score_expect")
         if expected_score:
             passed = expected_score.lower() in output.lower()
-            details =
+            details = (
+                output[:200]
+                if passed
+                else f"Expected '{expected_score}' in output: {output[:100]}..."
+            )
             results.append(TestResult("task score", passed, details))
         else:
             results.append(
@@ -663,7 +697,10 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
         else:
             results.append(
                 TestResult(
-                    "Submission",
+                    "Submission",
+                    False,
+                    docker_child.before or "",
+                    "Submission timed out waiting for result",
                 )
             )
     elif idx < 3:
@@ -759,26 +796,36 @@ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
             "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
         )
     )
-
+
     # Verify answer
-    if
-
+    if (
+        expected_answer and benchmark != "cybergym"
+    ):  # Cybergym submits file content, hard to verify here
+        results.append(
             TestResult(
                 "Answer matches submission",
                 answer == expected_answer,
-                f"Expected: '{expected_answer}', Got: '{answer}'"
+                f"Expected: '{expected_answer}', Got: '{answer}'",
            )
        )
    else:
-
+        results.append(
            TestResult(
                "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
            )
        )

-
-
-
+    # Score check - some benchmarks compute scores server-side later (not immediately)
+    no_immediate_score = benchmark in ("cybashbench", "nl2bash")
+    if no_immediate_score:
+        # These benchmarks don't produce immediate scores - skip check
+        pass
+    else:
+        results.append(
+            TestResult(
+                "Score recorded", score != "", f"Score: {score}" if score else "No score"
+            )
+        )

     # Check events (new flow uses setup_started/setup_completed instead of session_started)
     events = ssh_query(f"""
tests/e2e/e2e_test.py

@@ -37,6 +37,9 @@ TEST_NAME = "E2E Test User"
 # CLI config path (matches platformdirs on macOS)
 CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"

+# Use the pipx-installed CLI (what experts use), not the local venv version
+HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
 # Task assignments: 4 per benchmark
 # First 2 for pytest API tests, last 2 for interactive tests
 BENCHMARK_TASKS = {
@@ -347,10 +350,10 @@ def setup(admin_password: str, yes: bool):
     CLI_CONFIG_PATH.write_text(json.dumps(config, indent=2))
     console.print("[green]CLI config written[/green]")

-    # 7. Verify CLI works
+    # 7. Verify CLI works (use pipx version, not local venv)
     console.print("\nVerifying CLI authentication...")
     result = subprocess.run(
-        [
+        [str(HTE_CLI_PATH), "auth", "status"],
         capture_output=True,
         text=True,
     )
@@ -734,11 +737,14 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):

     phase1_result = subprocess.run(
         [
-            "uv",
+            "uv",
+            "run",
+            "pytest",
             str(tests_dir / "test_infrastructure.py"),
             str(tests_dir / "test_runtime_imports.py"),
             str(tests_dir / "test_benchmark_flows.py"),
-            "-v",
+            "-v",
+            "--tb=short",
         ],
         cwd=tests_dir.parent.parent,
     )
@@ -785,10 +791,13 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):

     phase3_result = subprocess.run(
         [
-            "uv",
+            "uv",
+            "run",
+            "pytest",
             str(tests_dir / "test_session_lifecycle.py"),
             str(tests_dir / "test_eval_logs.py"),
-            "-v",
+            "-v",
+            "--tb=short",
         ],
         cwd=tests_dir.parent.parent,
     )
@@ -833,10 +842,11 @@ def _print_full_summary(results: dict):
     if results["phase2"]:
         passed = sum(1 for v in results["phase2"].values() if v)
         total = len(results["phase2"])
-        status =
+        status = (
+            "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+        )
         details = ", ".join(
-            f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
-            for b, v in results["phase2"].items()
+            f"[green]{b}[/green]" if v else f"[red]{b}[/red]" for b, v in results["phase2"].items()
         )
         table.add_row("Phase 2: Benchmarks", status, details)

tests/e2e/test_benchmark_flows.py

@@ -16,7 +16,6 @@ import requests
 from tests.e2e.conftest import (
     BASE_URL,
     EXPECTED_ASSIGNMENT_COUNT,
-    EXPECTED_TASKS,
     get_test_user_id,
     ssh_command,
     ssh_query,
@@ -379,9 +378,9 @@ class TestCrossBenchmark:
             SELECT COUNT(*) FROM assignments
             WHERE user_id = '{get_test_user_id()}'
         """)
-        assert
-
-        )
+        assert (
+            int(count) == EXPECTED_ASSIGNMENT_COUNT
+        ), f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"


 # =============================================================================
tests/e2e/test_eval_logs.py

@@ -28,6 +28,18 @@ LOCAL_EVAL_LOGS_DIR = Path.home() / "Library" / "Application Support" / "hte-cli"
 VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"


+def db_path_to_host_path(db_path: str) -> str:
+    """Translate container path stored in DB to host path on VPS.
+
+    Backend may store paths as:
+    - /data/... (container-relative, needs translation)
+    - /opt/hte-web/data/... (already host path, return as-is)
+    """
+    if db_path.startswith("/opt/hte-web/"):
+        return db_path  # Already a host path
+    return db_path.replace("/data/", "/opt/hte-web/data/")
+
+
 def ssh_query(query: str) -> str:
     """Run a sqlite3 query on the VPS."""
     result = subprocess.run(
@@ -129,9 +141,16 @@ class TestVPSEvalLogs:
         """)

         # All completed sessions should have eval log paths
-
-
-
+        # Handle empty string from SQL query
+        with_path_count = int(with_path) if with_path else 0
+        total_count = int(count) if count else 0
+
+        if total_count == 0:
+            pytest.skip("No completed sessions to check")
+
+        assert (
+            with_path_count == total_count
+        ), f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"

     def test_eval_log_files_exist_on_vps(self):
         """Eval log files referenced in DB should exist on VPS."""
@@ -148,8 +167,9 @@ class TestVPSEvalLogs:

         for path in paths.split("\n"):
             if path:
-
-
+                host_path = db_path_to_host_path(path)
+                exists = ssh_command(f"test -f {host_path} && echo exists")
+                assert exists == "exists", f"Eval log not found: {host_path} (DB path: {path})"


 # =============================================================================
@@ -176,39 +196,41 @@ class TestEvalLogFormat:

     def test_eval_log_can_be_decompressed(self):
         """Eval logs should be valid gzip files."""
-
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         # Try to decompress
         result = ssh_command(f"gunzip -t {path} 2>&1 && echo ok")
         assert "ok" in result, f"Eval log not valid gzip: {result}"

     def test_eval_log_contains_expected_structure(self):
         """Eval logs should contain expected Inspect AI structure."""
-
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         # List contents of the gzipped eval (it's actually a zip inside gzip)
-        #
+        # Use python's zipfile since unzip may not be installed
         result = ssh_command(f"""
             cd /tmp &&
             cp {path} test_eval.gz &&
             gunzip -f test_eval.gz &&
-
+            python3 -c "import zipfile; z=zipfile.ZipFile('test_eval'); print('\\n'.join(z.namelist()[:20]))"
         """)

         # Should contain header.json at minimum
@@ -226,40 +248,58 @@ class TestEvalLogUpload:
     """Test eval log upload functionality."""

     def test_upload_event_recorded(self):
-        """Upload events should be recorded in session_events.
+        """Upload events should be recorded in session_events for sessions with eval logs.
+
+        Note: Upload events were added in CLI v0.2.22. Sessions created with older
+        CLI versions won't have these events.
+        """
+        # Find a session that has:
+        # 1. eval_log_path (proves upload succeeded)
+        # 2. session_started event with cli_version >= 0.2.22 (has upload events)
         session_id = ssh_query(f"""
-            SELECT id FROM sessions
-
-
+            SELECT s.id FROM sessions s
+            JOIN session_events se ON s.id = se.session_id
+            WHERE s.user_id = '{get_test_user_id()}'
+            AND s.status = 'submitted'
+            AND s.eval_log_path IS NOT NULL
+            AND se.event_type = 'session_started'
+            AND (
+                json_extract(se.event_data, '$.cli_version') >= '0.2.22'
+                OR json_extract(se.event_data, '$.cli_version') LIKE '0.3.%'
+                OR json_extract(se.event_data, '$.cli_version') LIKE '1.%'
+            )
             LIMIT 1
         """)

         if not session_id:
-            pytest.skip("No
+            pytest.skip("No sessions with CLI v0.2.22+ (upload events added in v0.2.22)")

         events = ssh_query(f"""
             SELECT event_type FROM session_events
             WHERE session_id = '{session_id}'
         """)

-        # Should have upload-related events for
+        # Should have upload-related events for sessions with eval logs
         event_list = events.split("\n") if events else []
         has_upload = any("upload" in e.lower() for e in event_list)
-
-        assert
+
+        assert (
+            has_upload
+        ), f"No upload events found for session {session_id}. Events: {event_list[:5]}"

     def test_eval_log_size_reasonable(self):
         """Eval logs should be reasonably sized (not empty, not huge)."""
-
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         size = ssh_command(f"stat -c%s {path} 2>/dev/null || stat -f%z {path}")

         if size.isdigit():
tests/e2e/test_infrastructure.py

@@ -114,9 +114,9 @@ class TestAssignments:
         count = ssh_query(
             f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
         )
-        assert
-
-        )
+        assert (
+            int(count) == EXPECTED_ASSIGNMENT_COUNT
+        ), f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"

     @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
     def test_benchmark_tasks_assigned(self, benchmark, tasks):
tests/e2e/test_runtime_imports.py

@@ -150,9 +150,7 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')

         assert "Loaded" in result.stdout
         # Should have exactly 7 benchmarks
-        assert "7 benchmarks" in result.stdout,
-            f"Expected 7 benchmarks, got: {result.stdout}"
-        )
+        assert "7 benchmarks" in result.stdout, f"Expected 7 benchmarks, got: {result.stdout}"

     def test_backend_can_import_adapters(self):
         """Backend should be able to instantiate adapters."""
@@ -180,9 +178,9 @@

         # All benchmarks should show OK - STRICT check
         for benchmark in BENCHMARKS:
-            assert
-            f"
-            )
+            assert (
+                f"{benchmark}: OK" in result.stdout
+            ), f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"


 class TestLocalImports:
tests/e2e/test_session_lifecycle.py

@@ -223,7 +223,6 @@ class TestSessionState:
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'abandoned'
         """)
-        count = int(abandoned_count) if abandoned_count else 0
         # Verify the query returned a valid number (not empty/error)
         assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
         # Note: count can legitimately be 0 if no sessions were abandoned
All remaining files listed above are unchanged between 0.2.21 and 0.2.23.