hte-cli 0.2.21.tar.gz → 0.2.23.tar.gz

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (32)
  1. {hte_cli-0.2.21 → hte_cli-0.2.23}/PKG-INFO +1 -1
  2. {hte_cli-0.2.21 → hte_cli-0.2.23}/pyproject.toml +1 -1
  3. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/cli.py +48 -481
  4. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/automated_runner.py +75 -28
  5. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/e2e_test.py +19 -9
  6. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_benchmark_flows.py +3 -4
  7. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_eval_logs.py +61 -21
  8. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_infrastructure.py +3 -3
  9. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_runtime_imports.py +4 -6
  10. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_session_lifecycle.py +0 -1
  11. {hte_cli-0.2.21 → hte_cli-0.2.23}/uv.lock +1 -1
  12. {hte_cli-0.2.21 → hte_cli-0.2.23}/.gitignore +0 -0
  13. {hte_cli-0.2.21 → hte_cli-0.2.23}/README.md +0 -0
  14. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/__init__.py +0 -0
  15. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/__main__.py +0 -0
  16. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/api_client.py +0 -0
  17. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/config.py +0 -0
  18. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/errors.py +0 -0
  19. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/events.py +0 -0
  20. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/image_utils.py +0 -0
  21. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/runner.py +0 -0
  22. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/scorers.py +0 -0
  23. {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/version_check.py +0 -0
  24. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/__init__.py +0 -0
  25. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/__init__.py +0 -0
  26. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/conftest.py +0 -0
  27. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/verify_docker_deps.py +0 -0
  28. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/__init__.py +0 -0
  29. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/conftest.py +0 -0
  30. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_image_utils.py +0 -0
  31. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_scorers.py +0 -0
{hte_cli-0.2.21 → hte_cli-0.2.23}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hte-cli
-Version: 0.2.21
+Version: 0.2.23
 Summary: Human Time-to-Completion Evaluation CLI
 Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
 Author: Lyptus Research
{hte_cli-0.2.21 → hte_cli-0.2.23}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hte-cli"
-version = "0.2.21"
+version = "0.2.23"
 description = "Human Time-to-Completion Evaluation CLI"
 readme = "README.md"
 requires-python = ">=3.11"
{hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/cli.py
@@ -194,7 +194,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
         elif e.status_code == 404:
             console.print("[red]Session not found. Check the session ID and try again.[/red]")
         elif e.status_code == 400 and "paused" in str(e).lower():
-            console.print("[yellow]Session is paused. Please resume from the web UI first.[/yellow]")
+            console.print(
+                "[yellow]Session is paused. Please resume from the web UI first.[/yellow]"
+            )
         else:
             console.print(f"[red]Error: {e}[/red]")
         sys.exit(1)
@@ -236,16 +238,16 @@ def session_join(ctx, session_id: str, force_setup: bool):
         try:
             files_zip = api.get_session_files(session_id)
             console.print(" [green]✓[/green] Task files downloaded")
-        except APIError as e:
-            console.print(f" [dim]○ No task files (optional)[/dim]")
+        except APIError:
+            console.print(" [dim]○ No task files (optional)[/dim]")
             files_zip = None

     with console.status("[dim]Fetching compose configuration...[/dim]"):
         try:
             compose_yaml = api.get_session_compose(session_id)
             console.print(" [green]✓[/green] Docker compose downloaded")
-        except APIError as e:
-            console.print(f" [dim]○ No compose file (optional)[/dim]")
+        except APIError:
+            console.print(" [dim]○ No compose file (optional)[/dim]")
             compose_yaml = None

     console.print()
@@ -258,9 +260,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
             f"[red]Error: {benchmark} requires a Docker sandbox but no compose file was found.[/red]"
         )
         console.print()
-        console.print(
-            f"Please contact support: {SUPPORT_EMAIL}"
-        )
+        console.print(f"Please contact support: {SUPPORT_EMAIL}")
         sys.exit(1)

     # Build assignment dict for runner compatibility
@@ -280,6 +280,14 @@ def session_join(ctx, session_id: str, force_setup: bool):
         },
     }

+    # Send session_started event (records CLI version for debugging)
+    events.session_started(
+        {
+            "cli_version": __version__,
+            "task_id": session_info["task_id"],
+        }
+    )
+
     # Step 3: Run setup (skip if reconnecting without force)
     setup_start_time = time.monotonic()
     images = []
@@ -313,7 +321,10 @@ def session_join(ctx, session_id: str, force_setup: bool):

             # Need to pull - show progress
             last_status = ["connecting..."]
-            with console.status(f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]") as status:
+            with console.status(
+                f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
+            ) as status:
+
                 def show_progress(image: str, line: str):
                     # Show docker output directly - includes MB progress from PTY
                     # Lines look like: "abc123: Downloading 360.9MB/4.075GB"
@@ -325,7 +336,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
                         display = f"{layer_id}: {layer_status}"
                         if display != last_status[0]:
                             last_status[0] = display
-                            status.update(f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]")
+                            status.update(
+                                f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]"
+                            )

                 success = pull_image_with_progress(img, on_progress=show_progress)
@@ -370,7 +383,13 @@ def session_join(ctx, session_id: str, force_setup: bool):
     console.print()

     # Step 3: Run the task using TaskRunner
-    step_num = "3" if (not is_reconnect or force_setup) and images else "2" if (not is_reconnect or force_setup) else "1"
+    step_num = (
+        "3"
+        if (not is_reconnect or force_setup) and images
+        else "2"
+        if (not is_reconnect or force_setup)
+        else "1"
+    )
     console.print(f"[bold]Step {step_num}:[/bold] Starting task environment...")
     console.print("[dim]Launching Docker containers...[/dim]")
     console.print()
@@ -391,7 +410,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
     except KeyboardInterrupt:
         events.docker_stopped(exit_code=130)
         console.print()
-        console.print("[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]")
+        console.print(
+            "[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]"
+        )
         sys.exit(0)
     except Exception as e:
         events.docker_stopped(exit_code=1)
@@ -415,10 +436,12 @@ def session_join(ctx, session_id: str, force_setup: bool):
     try:
         from io import BytesIO
         from zipfile import ZipFile
+
        with ZipFile(BytesIO(files_zip)) as zf:
            if "difficulty_levels.json" in zf.namelist():
                with zf.open("difficulty_levels.json") as f:
                    import json
+
                    difficulty_info = json.load(f)
                    agent_id = difficulty_info.get("agent_id")
    except Exception:
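
A self-contained sketch of the agent_id extraction the hunk above adds (the helper name is ours; difficulty_levels.json is the CyberGym marker file named in the diff):

    import json
    from io import BytesIO
    from zipfile import ZipFile

    def extract_agent_id(files_zip: bytes) -> str | None:
        """Return agent_id from difficulty_levels.json, if the zip carries one."""
        try:
            with ZipFile(BytesIO(files_zip)) as zf:
                if "difficulty_levels.json" in zf.namelist():
                    with zf.open("difficulty_levels.json") as f:
                        return json.load(f).get("agent_id")
        except Exception:
            pass  # not a CyberGym task, or a malformed zip
        return None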
@@ -429,13 +452,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
     console.print(f"Answer: {result.answer}")
     console.print(f"Time: {result.time_seconds:.1f}s")

+    # Track upload size and timing
+    upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
+    upload_size_kb = upload_size_bytes / 1024
+
+    events.upload_started(size_bytes=upload_size_bytes)
+    upload_start_time = time.monotonic()
+
     # Upload to server
     with Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
         console=console,
     ) as progress:
-        progress.add_task("Uploading result...", total=None)
+        size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
+        progress.add_task(f"Uploading result{size_str}...", total=None)
         try:
             upload_result = api.upload_result(
                 session_id=session_id,
@@ -450,6 +481,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
             console.print(f"[red]Failed to upload result: {e}[/red]")
             sys.exit(1)

+    # Record upload completion
+    upload_duration = time.monotonic() - upload_start_time
+    events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
+
     if upload_result.get("score") is not None:
         console.print(f"Score: {upload_result['score']}")

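The two hunks above bracket the upload with size and timing bookkeeping. A minimal sketch of that pattern, assuming EventStreamer methods with the signatures used at the call sites (do_upload stands in for the api.upload_result(...) call; on failure no completion event is sent, matching the diff):

    import time

    def timed_upload(events, do_upload, eval_log_bytes: bytes | None):
        """Wrap an upload callable with upload_started/upload_completed events."""
        size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
        events.upload_started(size_bytes=size_bytes)
        start = time.monotonic()
        result = do_upload()  # raises on failure, so no completion event is sent
        events.upload_completed(
            duration_seconds=time.monotonic() - start,
            size_bytes=size_bytes,
        )
        return result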
@@ -548,474 +583,6 @@ def tasks_run(ctx, task_id: str | None):
         sys.exit(1)


-# Keep the old implementation as _tasks_run_legacy for testing if needed
-def _tasks_run_legacy(ctx, task_id: str | None):
-    """Legacy implementation of tasks run (for testing only)."""
-    config: Config = ctx.obj["config"]
-
-    if not config.is_authenticated():
-        console.print("[red]Not logged in. Run: hte-cli auth login[/red]")
-        sys.exit(1)
-
-    # Check Docker and Compose version
-    docker_ok, docker_error = _check_docker()
-    if not docker_ok:
-        console.print(f"[red]{docker_error}[/red]")
-        sys.exit(1)
-
-    api = APIClient(config)
-
-    # Get assignments
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        progress.add_task("Fetching assignments...", total=None)
-        try:
-            assignments = api.get_assignments()
-        except APIError as e:
-            console.print(f"[red]Error: {e}[/red]")
-            sys.exit(1)
-
-    if not assignments:
-        console.print("[yellow]No pending assignments[/yellow]")
-        return
-
-    # Find the assignment to run
-    assignment = None
-    if task_id:
-        for a in assignments:
-            if a["task_id"] == task_id:
-                assignment = a
-                break
-        if not assignment:
-            console.print(f"[red]Task not found in your assignments: {task_id}[/red]")
-            sys.exit(1)
-    else:
-        # Take highest priority (first in list, already sorted by server)
-        assignment = assignments[0]
-
-    console.print()
-    console.print(
-        Panel(
-            f"[bold]Task:[/bold] {assignment['task_id']}\n"
-            f"[bold]Benchmark:[/bold] {assignment['benchmark']}\n"
-            f"[bold]Mode:[/bold] {assignment['mode']}",
-            title="Starting Task",
-        )
-    )
-    console.print()
-
-    # Import runner and events
-    from hte_cli.runner import TaskRunner
-    from hte_cli.events import EventStreamer
-
-    # Step 1: Start session (or resume existing)
-    session_id = assignment.get("session_id")
-    if not session_id:
-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            console=console,
-        ) as progress:
-            progress.add_task("Starting session...", total=None)
-            try:
-                session_info = api.start_session(assignment["assignment_id"])
-                session_id = session_info["session_id"]
-            except APIError as e:
-                console.print(f"[red]Failed to start session: {e}[/red]")
-                sys.exit(1)
-    else:
-        console.print(f"[yellow]Resuming session: {session_id}[/yellow]")
-
-    # Create event streamer
-    events = EventStreamer(api, session_id)
-
-    # Step 2: Download task files and compose
-    files_zip = None
-    compose_yaml = None
-
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        # Download files
-        task_id_display = progress.add_task("Downloading task files...", total=None)
-        try:
-            files_zip = api.get_assignment_files(assignment["assignment_id"])
-        except APIError as e:
-            # Files are optional for some benchmarks
-            console.print(f"[dim]Note: {e}[/dim]")
-
-        # Download compose
-        progress.update(task_id_display, description="Downloading compose file...")
-        try:
-            compose_yaml = api.get_assignment_compose(assignment["assignment_id"])
-        except APIError as e:
-            # Compose is optional for simple benchmarks
-            console.print(f"[dim]Note: {e}[/dim]")
-
-    # Validate compose for sandbox-required benchmarks
-    benchmark = assignment.get("benchmark", "").lower()
-    SANDBOX_REQUIRED_BENCHMARKS = {"cybench", "intercode-ctf", "cvebench", "cybergym", "nyuctf"}
-    if benchmark in SANDBOX_REQUIRED_BENCHMARKS and not compose_yaml:
-        console.print(
-            f"[red]Error: This {benchmark} task requires a Docker sandbox but none is available.[/red]"
-        )
-        if benchmark == "cvebench":
-            console.print(
-                "[yellow]This CVE may have infrastructure issues preventing Docker support.[/yellow]"
-            )
-        console.print("[dim]Contact support if you believe this is an error.[/dim]")
-        sys.exit(1)
-
-    # Step 3: Send session_started event
-    events.session_started(
-        {
-            "cli_version": __version__,
-            "task_id": assignment["task_id"],
-        }
-    )
-
-    # Step 4: Show pre-task message (benchmark-specific)
-    single_shot_benchmarks = {"nl2bash", "cybashbench"}
-
-    console.print()
-    if benchmark in single_shot_benchmarks:
-        # Single-shot benchmarks - no task score available
-        console.print(
-            Panel(
-                "[bold]Instructions[/bold]\n\n"
-                "You are about to enter the task environment.\n\n"
-                "[yellow]IMPORTANT: This is a SINGLE-SHOT task.[/yellow]\n"
-                "[yellow]You get ONE submission attempt - no retries![/yellow]\n\n"
-                "Commands available:\n"
-                " [cyan]task status[/cyan] - Show elapsed time\n"
-                ' [cyan]task submit "answer"[/cyan] - Submit your FINAL answer (ends task)\n'
-                " [cyan]task quit[/cyan] - Quit without submitting\n"
-                ' [cyan]task note "text"[/cyan] - Record observations\n',
-                title="Task Environment",
-            )
-        )
-    else:
-        # CTF/sandbox benchmarks - task score available
-        console.print(
-            Panel(
-                "[bold]Instructions[/bold]\n\n"
-                "You are about to enter the task environment.\n\n"
-                "Commands available:\n"
-                " [cyan]task status[/cyan] - Show elapsed time\n"
-                ' [cyan]task score "answer"[/cyan] - CHECK if correct (does NOT end task)\n'
-                ' [cyan]task submit "answer"[/cyan] - Submit FINAL answer (ends task)\n'
-                " [cyan]task quit[/cyan] - Quit without submitting\n"
-                ' [cyan]task note "text"[/cyan] - Record observations\n\n'
-                "[green]TIP: Use 'task score' to verify before submitting![/green]\n",
-                title="Task Environment",
-            )
-        )
-    console.print()
-
-    if not click.confirm("Ready to start?"):
-        console.print("[yellow]Cancelled[/yellow]")
-        return
-
-    # Step 5: Pre-pull Docker images with progress
-    from hte_cli.image_utils import extract_images_from_compose
-    import re
-    import time
-
-    setup_start_time = time.monotonic()
-    images: list[str] = []
-    results: list[tuple[str, bool, str]] = []
-
-    if compose_yaml:
-        images = extract_images_from_compose(compose_yaml)
-    if images:
-        events.setup_started(images)
-        console.print()
-        console.print(f"[bold]Preparing Docker environment ({len(images)} images)...[/bold]")
-
-        # Track layer progress per image: {layer_id: (status, downloaded_mb, total_mb)}
-        image_layers: dict[str, dict[str, tuple[str, float, float]]] = {}
-
-        def parse_size(size_str: str) -> float:
-            """Parse size string like '1.2MB' or '500kB' to MB."""
-            size_str = size_str.strip().upper()
-            if "GB" in size_str:
-                return float(size_str.replace("GB", "").strip()) * 1024
-            elif "MB" in size_str:
-                return float(size_str.replace("MB", "").strip())
-            elif "KB" in size_str:
-                return float(size_str.replace("KB", "").strip()) / 1024
-            elif "B" in size_str:
-                return float(size_str.replace("B", "").strip()) / (1024 * 1024)
-            return 0
-
-        def parse_docker_line(line: str) -> tuple[str | None, str, float, float]:
-            """Parse Docker pull output to extract layer ID, status, and sizes.
-
-            Returns: (layer_id, status, downloaded_mb, total_mb)
-            """
-            # Format: "79f742de2855: Downloading [==>] 1.2MB/50MB"
-            # Or: "79f742de2855: Pull complete"
-            match = re.match(r"([a-f0-9]+): (.+)", line)
-            if not match:
-                return None, "", 0, 0
-
-            layer_id = match.group(1)
-            status_part = match.group(2)
-
-            # Try to extract size info from "Downloading [==>] 1.2MB/50MB"
-            size_match = re.search(r"([\d.]+[kKmMgG]?[bB]?)/([\d.]+[kKmMgG]?[bB])", status_part)
-            if size_match:
-                downloaded = parse_size(size_match.group(1))
-                total = parse_size(size_match.group(2))
-                return layer_id, status_part, downloaded, total
-
-            return layer_id, status_part, 0, 0
-
-        def get_progress_summary(image: str) -> str:
-            """Get a human-readable progress summary for an image with MB counts."""
-            if image not in image_layers or not image_layers[image]:
-                return "connecting..."
-
-            layers = image_layers[image]
-            total_layers = len(layers)
-
-            # Count layers in different states
-            complete = 0
-            downloading = 0
-            waiting = 0
-            total_downloaded_mb = 0
-            total_size_mb = 0
-
-            for status, downloaded, total in layers.values():
-                status_lower = status.lower()
-                if "complete" in status_lower:
-                    complete += 1
-                    total_downloaded_mb += total
-                    total_size_mb += total
-                elif "downloading" in status_lower:
-                    downloading += 1
-                    total_downloaded_mb += downloaded
-                    total_size_mb += total
-                elif "waiting" in status_lower:
-                    waiting += 1
-
-            # Choose the most informative display
-            if complete == total_layers and total_layers > 0:
-                if total_size_mb > 0:
-                    return f"done ({total_size_mb:.0f}MB)"
-                return f"done ({total_layers} layers)"
-            elif total_size_mb > 0:
-                # Show MB progress when available
-                pct = int(100 * total_downloaded_mb / total_size_mb) if total_size_mb > 0 else 0
-                return f"{total_downloaded_mb:.0f}/{total_size_mb:.0f}MB ({pct}%)"
-            elif downloading > 0:
-                return f"downloading ({complete}/{total_layers} done)"
-            elif complete > 0:
-                return f"extracting ({complete}/{total_layers} done)"
-            elif waiting > 0:
-                return f"queued ({total_layers} layers)"
-            else:
-                return f"preparing ({total_layers} layers)"
-
-        def on_image_progress(image: str, line: str):
-            """Track layer-level progress with size info."""
-            if image not in image_layers:
-                image_layers[image] = {}
-
-            layer_id, status, downloaded, total = parse_docker_line(line)
-            if layer_id:
-                image_layers[image][layer_id] = (status, downloaded, total)
-
-        # Process images sequentially with clear output
-        results = []
-        for idx, img in enumerate(images, 1):
-            short_name = img.split("/")[-1] if "/" in img else img
-
-            # Check if cached first
-            from hte_cli.image_utils import check_image_exists_locally, pull_image_with_progress
-
-            if check_image_exists_locally(img):
-                console.print(f" [green]✓[/green] {short_name} [dim](cached)[/dim]")
-                results.append((img, True, "cached"))
-                continue
-
-            # Need to pull - use Rich Status for live updates
-            image_layers[img] = {}
-
-            with console.status(
-                f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
-            ) as status:
-
-                def show_progress(image: str, line: str):
-                    on_image_progress(image, line)
-                    summary = get_progress_summary(image)
-                    status.update(f"[yellow]↓[/yellow] {short_name} [dim]{summary}[/dim]")
-
-                success = pull_image_with_progress(img, on_progress=show_progress)
-
-            # Final status (printed after status context exits)
-            if success:
-                console.print(f" [green]✓[/green] {short_name} [dim](downloaded)[/dim]")
-                results.append((img, True, "pulled"))
-            else:
-                console.print(f" [red]✗[/red] {short_name} [dim](failed)[/dim]")
-                results.append((img, False, "failed"))
-
-        failed_count = sum(1 for _, ok, _ in results if not ok)
-        if failed_count > 0:
-            console.print(
-                f"[yellow]Warning: {failed_count} image(s) failed to pull. "
-                "Task may fail to start.[/yellow]"
-            )
-        console.print()
-
-    # Record image pull timing
-    if images:
-        pull_duration = time.monotonic() - setup_start_time
-        pulled = [img for img, ok, status in results if ok and status == "pulled"]
-        cached = [img for img, ok, status in results if ok and status == "cached"]
-        failed = [img for img, ok, status in results if not ok]
-        events.image_pull_completed(
-            duration_seconds=pull_duration,
-            pulled=pulled,
-            cached=cached,
-            failed=failed,
-        )
-
-    # Step 6: Run Inspect's human_cli
-    runner = TaskRunner()
-    console.print("[bold]Starting task environment...[/bold]")
-    console.print("[dim]Launching Docker containers...[/dim]")
-    console.print()
-
-    events.docker_started()
-
-    # Record total setup time (image pulls + compose up)
-    total_setup = time.monotonic() - setup_start_time
-    events.setup_completed(total_seconds=total_setup)
-
-    eval_log_bytes = None
-    local_eval_path = None
-    try:
-        result = runner.run_from_assignment(
-            assignment=assignment,
-            compose_yaml=compose_yaml,
-            files_zip=files_zip,
-        )
-        # Read eval log BEFORE cleanup (cleanup deletes the temp directory)
-        if result.eval_log_path and result.eval_log_path.exists():
-            eval_log_bytes = result.eval_log_path.read_bytes()
-
-            # Save local copy for safety
-            eval_logs_dir = get_eval_logs_dir()
-            eval_logs_dir.mkdir(parents=True, exist_ok=True)
-            local_eval_path = eval_logs_dir / result.eval_log_path.name
-            local_eval_path.write_bytes(eval_log_bytes)
-    except Exception as e:
-        events.docker_stopped(exit_code=1)
-        console.print(f"[red]Task execution failed: {e}[/red]")
-        sys.exit(1)
-    finally:
-        runner.cleanup()
-
-    events.docker_stopped(exit_code=0)
-
-    # Step 6: Show post-task summary
-    console.print()
-    console.print(
-        Panel(
-            f"[bold]Time spent:[/bold] {result.time_seconds / 60:.1f} minutes\n"
-            f"[bold]Answer:[/bold] {result.answer or '(none)'}\n"
-            f"[bold]Score:[/bold] {result.score if result.score is not None else 'pending'}",
-            title="Task Complete",
-        )
-    )
-
-    # Defensive check: don't upload if task didn't actually run
-    # (catches edge cases where runner returned without proper error)
-    if result.time_seconds == 0.0 and result.answer is None:
-        console.print()
-        console.print("[red]Task did not complete successfully (0 time, no answer).[/red]")
-        console.print("[yellow]Session preserved - run 'hte-cli tasks run' to retry.[/yellow]")
-        sys.exit(1)
-
-    # Step 7: Upload result
-    events.session_completed(
-        elapsed_seconds=result.time_seconds,
-        answer=result.answer,
-    )
-
-    # Extract agent_id from task files for CyberGym post-hoc verification
-    agent_id = None
-    if files_zip:
-        try:
-            with ZipFile(BytesIO(files_zip)) as zf:
-                if "difficulty_levels.json" in zf.namelist():
-                    with zf.open("difficulty_levels.json") as f:
-                        difficulty_info = json.load(f)
-                        agent_id = difficulty_info.get("agent_id")
-        except Exception:
-            pass  # Not a CyberGym task or malformed zip
-
-    # Show upload size info and track timing
-    upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
-    upload_size_kb = upload_size_bytes / 1024
-    if upload_size_kb / 1024 > 50:
-        console.print(f"[yellow]Warning: Large eval log ({upload_size_kb / 1024:.1f} MB)[/yellow]")
-
-    events.upload_started(size_bytes=upload_size_bytes)
-    upload_start_time = time.monotonic()
-
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
-        progress.add_task(f"Uploading result{size_str}...", total=None)
-
-        try:
-            upload_result = api.upload_result(
-                session_id=session_id,
-                answer=result.answer or "",
-                client_active_seconds=result.time_seconds,
-                eval_log_bytes=eval_log_bytes,
-                score=result.score,
-                score_binarized=result.score_binarized,
-                agent_id=agent_id,
-            )
-        except APIError as e:
-            console.print(f"[red]Failed to upload result: {e}[/red]")
-            if local_eval_path:
-                console.print(f"[yellow]Eval log saved locally: {local_eval_path}[/yellow]")
-                console.print("[yellow]Your result was saved locally but not uploaded.[/yellow]")
-            sys.exit(1)
-
-    # Record upload completion
-    upload_duration = time.monotonic() - upload_start_time
-    events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
-
-    console.print()
-    console.print("[green]Result uploaded successfully![/green]")
-
-    # Show local eval log path (quote paths with spaces for easy copy-paste)
-    if local_eval_path:
-        path_str = str(local_eval_path)
-        if " " in path_str:
-            path_str = f'"{path_str}"'
-        console.print(f"[dim]Eval log: {path_str}[/dim]")
-
-    # Show next task if available
-    if upload_result.get("next_assignment_id"):
-        console.print()
-        console.print("Run [bold]hte-cli tasks run[/bold] for the next task.")
-
-
 @tasks.command("pull-images")
 @click.option("--count", "-n", default=5, help="Number of upcoming tasks to pull images for")
 @click.pass_context
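
The deleted legacy flow parsed raw `docker pull` output into MB figures for its status line. Condensed from the removed parse_size/parse_docker_line helpers, with one sample layer line:

    import re

    def parse_size(size_str: str) -> float:
        """Parse '1.2MB', '500kB', or '4.075GB' into megabytes."""
        s = size_str.strip().upper()
        if "GB" in s:
            return float(s.replace("GB", "")) * 1024
        if "MB" in s:
            return float(s.replace("MB", ""))
        if "KB" in s:
            return float(s.replace("KB", "")) / 1024
        if "B" in s:
            return float(s.replace("B", "")) / (1024 * 1024)
        return 0.0

    line = "79f742de2855: Downloading [==>] 360.9MB/4.075GB"
    layer_id, status = re.match(r"([a-f0-9]+): (.+)", line).groups()
    sizes = re.search(r"([\d.]+[kKmMgG]?[bB]?)/([\d.]+[kKmMgG]?[bB])", status)
    print(layer_id, parse_size(sizes.group(1)), parse_size(sizes.group(2)))
    # -> 79f742de2855 360.9 4172.8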
{hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/automated_runner.py
@@ -52,6 +52,10 @@ def strip_ansi(text: str) -> str:

 VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")

+# Use the pipx-installed CLI (what experts use), not the local venv version
+# This ensures E2E tests validate the actual published PyPI package
+HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
 # Environment verification commands per benchmark
 # Based on docs/benchmark-environments.md
 # Format: (command, expected_pattern, description)
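
HTE_CLI_PATH hardcodes pipx's default bin directory. If the runner ever needs to cope with a non-default pipx home, a more defensive lookup might look like this (the helper and its fallback policy are assumptions, not part of this diff):

    import shutil
    from pathlib import Path

    def resolve_hte_cli() -> Path:
        """Prefer the pipx-installed binary; fall back to whatever is on PATH."""
        pipx_bin = Path.home() / ".local" / "bin" / "hte-cli"
        if pipx_bin.exists():
            return pipx_bin
        found = shutil.which("hte-cli")
        if found:
            return Path(found)  # dev fallback - may be a local venv build
        raise FileNotFoundError("hte-cli not installed - run: pipx install hte-cli")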
@@ -114,7 +118,11 @@ ENVIRONMENT_CHECKS = {
         # 32-bit support (for CTF binaries)
         ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
         # Task files location - STRICT
-        ("test -d /home/ctfplayer/ctf_files && echo 'EXISTS'", "EXISTS", "ctf_files directory exists"),
+        (
+            "test -d /home/ctfplayer/ctf_files && echo 'EXISTS'",
+            "EXISTS",
+            "ctf_files directory exists",
+        ),
     ],
     "cybench": [
         # Working directory: /app (official cybench/cybench:latest image)
@@ -148,22 +156,26 @@ ENVIRONMENT_CHECKS = {
         # Check that files exist in /root (not just empty directory) using test -gt 0
         ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
         # For tasks with subdirs (chunky, etc.), verify structure is mounted
-        ("test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0", None, "Files under /root (recursive)"),
+        (
+            "test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0",
+            None,
+            "Files under /root (recursive)",
+        ),
     ],
     "cybergym": [
         # Working directory: /root
         ("pwd", "/root", "Working directory is /root"),
         # OS: Kali Linux (same base as cybench)
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # CyberGym-specific: submit.sh MUST exist for PoC submission
-        ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present at /submit.sh"),
-        ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
+        # CyberGym-specific: submit.sh in home dir for PoC submission
+        ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
+        ("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
         # Required tools - STRICT (same image as cybench)
         ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
         ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
         ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
-        # Core security tools - STRICT
-        ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+        # Python via venv (same as cybench/cybench:latest)
+        ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
         ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
         ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
         # apt-get for additional tools
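
Each ENVIRONMENT_CHECKS entry is a (command, expected_pattern, description) tuple, where a None pattern leaves the verdict to the command's exit status. A sketch of how one tuple could be evaluated inside the task container (the docker exec invocation is an assumption about the harness, not code from this package):

    import subprocess

    def run_check(container: str, cmd: str, expect: str | None) -> bool:
        """Evaluate one environment check inside a running container."""
        proc = subprocess.run(
            ["docker", "exec", container, "bash", "-lc", cmd],
            capture_output=True,
            text=True,
        )
        if expect is None:
            return proc.returncode == 0  # e.g. the "test ... -gt 0" checks
        return expect in proc.stdout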
@@ -203,7 +215,7 @@ TASK_FILE_CHECKS = {
         "primary_knowledge": [],  # No local files - network-based
     },
     "cybergym": {
-        "*": ["/submit.sh"],  # All cybergym tasks need submit.sh
+        "*": ["/root/submit.sh"],  # All cybergym tasks need submit.sh in home
     },
     "nyuctf": {
         "*": ["/home/ctfplayer/ctf_files"],  # Standard location
@@ -223,29 +235,28 @@ def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
     expected = task_files.get(task_id) or task_files.get("*", [])

     for path in expected:
-        checks.append(
-            (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
-        )
+        checks.append((f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists"))

     return checks

+
 # Commands to run for submission tests
 SUBMISSION_TESTS = {
     "intercode-ctf": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "incorrect",  # Should say incorrect
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
     "nyuctf": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "incorrect",
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
     "cybench": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "incorrect",
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
@@ -266,6 +277,12 @@ SUBMISSION_TESTS = {
         "submit_cmd": 'task submit "ls -la"',
         "submit_expect": "submitted",
     },
+    "nl2bash": {
+        # Single-shot - no score command (same as cybashbench)
+        "score_cmd": None,
+        "submit_cmd": 'task submit "ls -la"',
+        "submit_expect": "submitted",
+    },
 }

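For illustration, the updated cybergym wildcard entry makes get_file_checks_for_task yield a single existence check (output reconstructed from the function body shown above; the task id is a placeholder):

    checks = get_file_checks_for_task("cybergym", "any-task-id")
    # [("test -e /root/submit.sh && echo 'EXISTS'", "EXISTS", "/root/submit.sh exists")]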
@@ -374,9 +391,16 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

     # Start the CLI using the new session join flow
     # Session has status="created", so CLI will run full setup
-    console.print(f"Launching hte-cli session join {session_id}...")
+    # Use explicit pipx path to test the published PyPI version, not local dev
+    if not HTE_CLI_PATH.exists():
+        console.print(f"[red]hte-cli not found at {HTE_CLI_PATH}[/red]")
+        console.print("[yellow]Install with: pipx install hte-cli[/yellow]")
+        results.append(TestResult("CLI installed", False, "", f"hte-cli not at {HTE_CLI_PATH}"))
+        return results
+
+    console.print(f"Launching {HTE_CLI_PATH} session join {session_id}...")
     child = pexpect.spawn(
-        f"hte-cli session join {session_id}",
+        f"{HTE_CLI_PATH} session join {session_id}",
         encoding="utf-8",
         timeout=timeout,
         env={**os.environ, "TERM": "dumb"},  # Disable colors for easier parsing
@@ -426,6 +450,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     results.append(TestResult("Environment setup", True, "Environment ready"))

     # Wait for the "Login to the system" message and docker exec command
+    # CVE bench builds containers from source, can take 5+ minutes
     console.print("Waiting for docker exec command...")
     idx = child.expect(
         [
@@ -433,7 +458,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
             r"docker exec -it",
             pexpect.TIMEOUT,
         ],
-        timeout=120,
+        timeout=300,  # 5 minutes for slow builds (cvebench)
     )

     if idx == 2:  # TIMEOUT
@@ -603,15 +628,24 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

     # Test score command if available
     if sub_tests.get("score_cmd"):
+        # Clear buffer before score test to avoid capturing stale output
+        try:
+            docker_child.read_nonblocking(size=10000, timeout=0.5)
+        except Exception:
+            pass
         docker_child.sendline(sub_tests["score_cmd"])
         time.sleep(2)
         docker_child.expect(prompt_patterns[:-1], timeout=30)
         output = strip_ansi(docker_child.before or "")
-
+
         expected_score = sub_tests.get("score_expect")
         if expected_score:
             passed = expected_score.lower() in output.lower()
-            details = output[:200] if passed else f"Expected '{expected_score}' in output: {output[:100]}..."
+            details = (
+                output[:200]
+                if passed
+                else f"Expected '{expected_score}' in output: {output[:100]}..."
+            )
             results.append(TestResult("task score", passed, details))
         else:
             results.append(
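
The buffer clear added before the score test is a common pexpect idiom: discard whatever output is already buffered so the next expect() matches only fresh output. As a reusable helper (a sketch; the diff inlines it with a bare except Exception):

    import pexpect

    def drain(child: pexpect.spawn) -> None:
        """Discard any output already sitting in the pexpect buffer."""
        try:
            child.read_nonblocking(size=10000, timeout=0.5)
        except (pexpect.TIMEOUT, pexpect.EOF):
            pass  # nothing buffered, or the stream closed - either is fine here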
@@ -663,7 +697,10 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
         else:
             results.append(
                 TestResult(
-                    "Submission", False, docker_child.before or "", "Submission timed out waiting for result"
+                    "Submission",
+                    False,
+                    docker_child.before or "",
+                    "Submission timed out waiting for result",
                 )
             )
     elif idx < 3:
@@ -759,26 +796,36 @@ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
             "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
         )
     )
-
+
     # Verify answer
-    if expected_answer and benchmark != "cybergym":  # Cybergym submits file content, hard to verify here
-        results.append(
+    if (
+        expected_answer and benchmark != "cybergym"
+    ):  # Cybergym submits file content, hard to verify here
+        results.append(
             TestResult(
                 "Answer matches submission",
                 answer == expected_answer,
-                f"Expected: '{expected_answer}', Got: '{answer}'"
+                f"Expected: '{expected_answer}', Got: '{answer}'",
             )
         )
     else:
-        results.append(
+        results.append(
             TestResult(
                 "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
             )
         )

-    results.append(
-        TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
-    )
+    # Score check - some benchmarks compute scores server-side later (not immediately)
+    no_immediate_score = benchmark in ("cybashbench", "nl2bash")
+    if no_immediate_score:
+        # These benchmarks don't produce immediate scores - skip check
+        pass
+    else:
+        results.append(
+            TestResult(
+                "Score recorded", score != "", f"Score: {score}" if score else "No score"
+            )
+        )

     # Check events (new flow uses setup_started/setup_completed instead of session_started)
     events = ssh_query(f"""
{hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/e2e_test.py
@@ -37,6 +37,9 @@ TEST_NAME = "E2E Test User"
 # CLI config path (matches platformdirs on macOS)
 CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"

+# Use the pipx-installed CLI (what experts use), not the local venv version
+HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
 # Task assignments: 4 per benchmark
 # First 2 for pytest API tests, last 2 for interactive tests
 BENCHMARK_TASKS = {
@@ -347,10 +350,10 @@ def setup(admin_password: str, yes: bool):
     CLI_CONFIG_PATH.write_text(json.dumps(config, indent=2))
     console.print("[green]CLI config written[/green]")

-    # 7. Verify CLI works
+    # 7. Verify CLI works (use pipx version, not local venv)
     console.print("\nVerifying CLI authentication...")
     result = subprocess.run(
-        ["hte-cli", "auth", "status"],
+        [str(HTE_CLI_PATH), "auth", "status"],
         capture_output=True,
         text=True,
     )
@@ -734,11 +737,14 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):

     phase1_result = subprocess.run(
         [
-            "uv", "run", "pytest",
+            "uv",
+            "run",
+            "pytest",
             str(tests_dir / "test_infrastructure.py"),
             str(tests_dir / "test_runtime_imports.py"),
             str(tests_dir / "test_benchmark_flows.py"),
-            "-v", "--tb=short",
+            "-v",
+            "--tb=short",
         ],
         cwd=tests_dir.parent.parent,
     )
@@ -785,10 +791,13 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):

     phase3_result = subprocess.run(
         [
-            "uv", "run", "pytest",
+            "uv",
+            "run",
+            "pytest",
             str(tests_dir / "test_session_lifecycle.py"),
             str(tests_dir / "test_eval_logs.py"),
-            "-v", "--tb=short",
+            "-v",
+            "--tb=short",
         ],
         cwd=tests_dir.parent.parent,
     )
@@ -833,10 +842,11 @@ def _print_full_summary(results: dict):
     if results["phase2"]:
         passed = sum(1 for v in results["phase2"].values() if v)
         total = len(results["phase2"])
-        status = "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+        status = (
+            "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+        )
         details = ", ".join(
-            f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
-            for b, v in results["phase2"].items()
+            f"[green]{b}[/green]" if v else f"[red]{b}[/red]" for b, v in results["phase2"].items()
         )
         table.add_row("Phase 2: Benchmarks", status, details)

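Both phase runners shell out to pytest with the same command shape; a sketch of the helper they could share (the function name is ours):

    import subprocess
    from pathlib import Path

    def run_phase(tests_dir: Path, files: list[str]) -> int:
        """Run one pytest phase, mirroring the subprocess.run calls above."""
        cmd = [
            "uv",
            "run",
            "pytest",
            *[str(tests_dir / f) for f in files],
            "-v",
            "--tb=short",
        ]
        return subprocess.run(cmd, cwd=tests_dir.parent.parent).returncode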
{hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_benchmark_flows.py
@@ -16,7 +16,6 @@ import requests
 from tests.e2e.conftest import (
     BASE_URL,
     EXPECTED_ASSIGNMENT_COUNT,
-    EXPECTED_TASKS,
     get_test_user_id,
     ssh_command,
     ssh_query,
@@ -379,9 +378,9 @@ class TestCrossBenchmark:
             SELECT COUNT(*) FROM assignments
             WHERE user_id = '{get_test_user_id()}'
         """)
-        assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
-            f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
-        )
+        assert (
+            int(count) == EXPECTED_ASSIGNMENT_COUNT
+        ), f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"


 # =============================================================================
{hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_eval_logs.py
@@ -28,6 +28,18 @@ LOCAL_EVAL_LOGS_DIR = Path.home() / "Library" / "Application Support" / "hte-cli
 VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"


+def db_path_to_host_path(db_path: str) -> str:
+    """Translate container path stored in DB to host path on VPS.
+
+    Backend may store paths as:
+    - /data/... (container-relative, needs translation)
+    - /opt/hte-web/data/... (already host path, return as-is)
+    """
+    if db_path.startswith("/opt/hte-web/"):
+        return db_path  # Already a host path
+    return db_path.replace("/data/", "/opt/hte-web/data/")
+
+
 def ssh_query(query: str) -> str:
     """Run a sqlite3 query on the VPS."""
     result = subprocess.run(
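
The new helper's behavior on both path shapes stored in the DB (the filename is illustrative):

    assert db_path_to_host_path("/data/eval_logs/abc.eval.gz") == (
        "/opt/hte-web/data/eval_logs/abc.eval.gz"
    )
    assert db_path_to_host_path("/opt/hte-web/data/eval_logs/abc.eval.gz") == (
        "/opt/hte-web/data/eval_logs/abc.eval.gz"
    )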
@@ -129,9 +141,16 @@ class TestVPSEvalLogs:
         """)

         # All completed sessions should have eval log paths
-        assert int(with_path) == int(
-            count
-        ), f"Only {with_path}/{count} completed sessions have eval_log_path"
+        # Handle empty string from SQL query
+        with_path_count = int(with_path) if with_path else 0
+        total_count = int(count) if count else 0
+
+        if total_count == 0:
+            pytest.skip("No completed sessions to check")
+
+        assert (
+            with_path_count == total_count
+        ), f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"

     def test_eval_log_files_exist_on_vps(self):
         """Eval log files referenced in DB should exist on VPS."""
@@ -148,8 +167,9 @@ class TestVPSEvalLogs:

         for path in paths.split("\n"):
             if path:
-                exists = ssh_command(f"test -f {path} && echo exists")
-                assert exists == "exists", f"Eval log not found: {path}"
+                host_path = db_path_to_host_path(path)
+                exists = ssh_command(f"test -f {host_path} && echo exists")
+                assert exists == "exists", f"Eval log not found: {host_path} (DB path: {path})"


 # =============================================================================
@@ -176,39 +196,41 @@ class TestEvalLogFormat:

     def test_eval_log_can_be_decompressed(self):
         """Eval logs should be valid gzip files."""
-        path = ssh_query("""
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not path:
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         # Try to decompress
         result = ssh_command(f"gunzip -t {path} 2>&1 && echo ok")
         assert "ok" in result, f"Eval log not valid gzip: {result}"

     def test_eval_log_contains_expected_structure(self):
         """Eval logs should contain expected Inspect AI structure."""
-        path = ssh_query("""
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not path:
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         # List contents of the gzipped eval (it's actually a zip inside gzip)
-        # First copy to temp, decompress, check structure
+        # Use python's zipfile since unzip may not be installed
         result = ssh_command(f"""
             cd /tmp &&
             cp {path} test_eval.gz &&
             gunzip -f test_eval.gz &&
-            unzip -l test_eval 2>/dev/null | head -20
+            python3 -c "import zipfile; z=zipfile.ZipFile('test_eval'); print('\\n'.join(z.namelist()[:20]))"
         """)

         # Should contain header.json at minimum
@@ -226,40 +248,58 @@ class TestEvalLogUpload:
     """Test eval log upload functionality."""

     def test_upload_event_recorded(self):
-        """Upload events should be recorded in session_events."""
+        """Upload events should be recorded in session_events for sessions with eval logs.
+
+        Note: Upload events were added in CLI v0.2.22. Sessions created with older
+        CLI versions won't have these events.
+        """
+        # Find a session that has:
+        # 1. eval_log_path (proves upload succeeded)
+        # 2. session_started event with cli_version >= 0.2.22 (has upload events)
         session_id = ssh_query(f"""
-            SELECT id FROM sessions
-            WHERE user_id = '{get_test_user_id()}'
-            AND status = 'submitted'
+            SELECT s.id FROM sessions s
+            JOIN session_events se ON s.id = se.session_id
+            WHERE s.user_id = '{get_test_user_id()}'
+            AND s.status = 'submitted'
+            AND s.eval_log_path IS NOT NULL
+            AND se.event_type = 'session_started'
+            AND (
+                json_extract(se.event_data, '$.cli_version') >= '0.2.22'
+                OR json_extract(se.event_data, '$.cli_version') LIKE '0.3.%'
+                OR json_extract(se.event_data, '$.cli_version') LIKE '1.%'
+            )
             LIMIT 1
         """)

         if not session_id:
-            pytest.skip("No completed sessions")
+            pytest.skip("No sessions with CLI v0.2.22+ (upload events added in v0.2.22)")

         events = ssh_query(f"""
             SELECT event_type FROM session_events
             WHERE session_id = '{session_id}'
         """)

-        # Should have upload-related events for completed sessions
+        # Should have upload-related events for sessions with eval logs
         event_list = events.split("\n") if events else []
         has_upload = any("upload" in e.lower() for e in event_list)
-        # Completed sessions should have upload events
-        assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"
+
+        assert (
+            has_upload
+        ), f"No upload events found for session {session_id}. Events: {event_list[:5]}"

     def test_eval_log_size_reasonable(self):
         """Eval logs should be reasonably sized (not empty, not huge)."""
-        path = ssh_query("""
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not path:
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         size = ssh_command(f"stat -c%s {path} 2>/dev/null || stat -f%z {path}")

         if size.isdigit():
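
The SQL above compares cli_version lexicographically and patches in the LIKE '0.3.%' / LIKE '1.%' clauses because plain string comparison breaks once a version component gains a second digit (as strings, '0.10.0' < '0.2.22'). In Python the same gate is a tuple comparison (a sketch assuming plain X.Y.Z version strings):

    def at_least(version: str, floor: str = "0.2.22") -> bool:
        """True if a dotted version string is >= the floor version."""
        def parse(v: str) -> tuple[int, ...]:
            return tuple(int(part) for part in v.split("."))
        return parse(version) >= parse(floor)

    assert at_least("0.2.23") and at_least("0.3.0") and at_least("1.0.0")
    assert not at_least("0.2.21")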
{hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_infrastructure.py
@@ -114,9 +114,9 @@ class TestAssignments:
         count = ssh_query(
             f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
         )
-        assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
-            f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
-        )
+        assert (
+            int(count) == EXPECTED_ASSIGNMENT_COUNT
+        ), f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"

     @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
     def test_benchmark_tasks_assigned(self, benchmark, tasks):
{hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_runtime_imports.py
@@ -150,9 +150,7 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')

         assert "Loaded" in result.stdout
         # Should have exactly 7 benchmarks
-        assert "7 benchmarks" in result.stdout, (
-            f"Expected 7 benchmarks, got: {result.stdout}"
-        )
+        assert "7 benchmarks" in result.stdout, f"Expected 7 benchmarks, got: {result.stdout}"

     def test_backend_can_import_adapters(self):
         """Backend should be able to instantiate adapters."""
@@ -180,9 +178,9 @@ for name, cls in HUMAN_REGISTRY.items():

         # All benchmarks should show OK - STRICT check
         for benchmark in BENCHMARKS:
-            assert f"{benchmark}: OK" in result.stdout, (
-                f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
-            )
+            assert (
+                f"{benchmark}: OK" in result.stdout
+            ), f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"


 class TestLocalImports:
{hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_session_lifecycle.py
@@ -223,7 +223,6 @@ class TestSessionState:
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'abandoned'
         """)
-        count = int(abandoned_count) if abandoned_count else 0
         # Verify the query returned a valid number (not empty/error)
         assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
         # Note: count can legitimately be 0 if no sessions were abandoned
{hte_cli-0.2.21 → hte_cli-0.2.23}/uv.lock
@@ -625,7 +625,7 @@ wheels = [

 [[package]]
 name = "hte-cli"
-version = "0.2.20"
+version = "0.2.22"
 source = { editable = "." }
 dependencies = [
     { name = "click" },