hte-cli 0.2.22__tar.gz → 0.2.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {hte_cli-0.2.22 → hte_cli-0.2.23}/PKG-INFO +1 -1
  2. {hte_cli-0.2.22 → hte_cli-0.2.23}/pyproject.toml +1 -1
  3. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/cli.py +27 -480
  4. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/automated_runner.py +32 -14
  5. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/e2e_test.py +14 -7
  6. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/test_benchmark_flows.py +3 -4
  7. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/test_eval_logs.py +33 -13
  8. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/test_infrastructure.py +3 -3
  9. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/test_runtime_imports.py +4 -6
  10. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/test_session_lifecycle.py +0 -1
  11. {hte_cli-0.2.22 → hte_cli-0.2.23}/uv.lock +1 -1
  12. {hte_cli-0.2.22 → hte_cli-0.2.23}/.gitignore +0 -0
  13. {hte_cli-0.2.22 → hte_cli-0.2.23}/README.md +0 -0
  14. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/__init__.py +0 -0
  15. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/__main__.py +0 -0
  16. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/api_client.py +0 -0
  17. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/config.py +0 -0
  18. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/errors.py +0 -0
  19. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/events.py +0 -0
  20. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/image_utils.py +0 -0
  21. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/runner.py +0 -0
  22. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/scorers.py +0 -0
  23. {hte_cli-0.2.22 → hte_cli-0.2.23}/src/hte_cli/version_check.py +0 -0
  24. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/__init__.py +0 -0
  25. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/__init__.py +0 -0
  26. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/conftest.py +0 -0
  27. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/e2e/verify_docker_deps.py +0 -0
  28. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/unit/__init__.py +0 -0
  29. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/unit/conftest.py +0 -0
  30. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/unit/test_image_utils.py +0 -0
  31. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.22 → hte_cli-0.2.23}/tests/unit/test_scorers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hte-cli
3
- Version: 0.2.22
3
+ Version: 0.2.23
4
4
  Summary: Human Time-to-Completion Evaluation CLI
5
5
  Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
6
6
  Author: Lyptus Research
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "hte-cli"
3
- version = "0.2.22"
3
+ version = "0.2.23"
4
4
  description = "Human Time-to-Completion Evaluation CLI"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -194,7 +194,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
194
194
  elif e.status_code == 404:
195
195
  console.print("[red]Session not found. Check the session ID and try again.[/red]")
196
196
  elif e.status_code == 400 and "paused" in str(e).lower():
197
- console.print("[yellow]Session is paused. Please resume from the web UI first.[/yellow]")
197
+ console.print(
198
+ "[yellow]Session is paused. Please resume from the web UI first.[/yellow]"
199
+ )
198
200
  else:
199
201
  console.print(f"[red]Error: {e}[/red]")
200
202
  sys.exit(1)
@@ -236,16 +238,16 @@ def session_join(ctx, session_id: str, force_setup: bool):
236
238
  try:
237
239
  files_zip = api.get_session_files(session_id)
238
240
  console.print(" [green]✓[/green] Task files downloaded")
239
- except APIError as e:
240
- console.print(f" [dim]○ No task files (optional)[/dim]")
241
+ except APIError:
242
+ console.print(" [dim]○ No task files (optional)[/dim]")
241
243
  files_zip = None
242
244
 
243
245
  with console.status("[dim]Fetching compose configuration...[/dim]"):
244
246
  try:
245
247
  compose_yaml = api.get_session_compose(session_id)
246
248
  console.print(" [green]✓[/green] Docker compose downloaded")
247
- except APIError as e:
248
- console.print(f" [dim]○ No compose file (optional)[/dim]")
249
+ except APIError:
250
+ console.print(" [dim]○ No compose file (optional)[/dim]")
249
251
  compose_yaml = None
250
252
 
251
253
  console.print()
@@ -258,9 +260,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
258
260
  f"[red]Error: {benchmark} requires a Docker sandbox but no compose file was found.[/red]"
259
261
  )
260
262
  console.print()
261
- console.print(
262
- f"Please contact support: {SUPPORT_EMAIL}"
263
- )
263
+ console.print(f"Please contact support: {SUPPORT_EMAIL}")
264
264
  sys.exit(1)
265
265
 
266
266
  # Build assignment dict for runner compatibility
@@ -321,7 +321,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
321
321
 
322
322
  # Need to pull - show progress
323
323
  last_status = ["connecting..."]
324
- with console.status(f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]") as status:
324
+ with console.status(
325
+ f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
326
+ ) as status:
327
+
325
328
  def show_progress(image: str, line: str):
326
329
  # Show docker output directly - includes MB progress from PTY
327
330
  # Lines look like: "abc123: Downloading 360.9MB/4.075GB"
@@ -333,7 +336,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
333
336
  display = f"{layer_id}: {layer_status}"
334
337
  if display != last_status[0]:
335
338
  last_status[0] = display
336
- status.update(f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]")
339
+ status.update(
340
+ f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]"
341
+ )
337
342
 
338
343
  success = pull_image_with_progress(img, on_progress=show_progress)
339
344
 
@@ -378,7 +383,13 @@ def session_join(ctx, session_id: str, force_setup: bool):
378
383
  console.print()
379
384
 
380
385
  # Step 3: Run the task using TaskRunner
381
- step_num = "3" if (not is_reconnect or force_setup) and images else "2" if (not is_reconnect or force_setup) else "1"
386
+ step_num = (
387
+ "3"
388
+ if (not is_reconnect or force_setup) and images
389
+ else "2"
390
+ if (not is_reconnect or force_setup)
391
+ else "1"
392
+ )
382
393
  console.print(f"[bold]Step {step_num}:[/bold] Starting task environment...")
383
394
  console.print("[dim]Launching Docker containers...[/dim]")
384
395
  console.print()
@@ -399,7 +410,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
399
410
  except KeyboardInterrupt:
400
411
  events.docker_stopped(exit_code=130)
401
412
  console.print()
402
- console.print("[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]")
413
+ console.print(
414
+ "[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]"
415
+ )
403
416
  sys.exit(0)
404
417
  except Exception as e:
405
418
  events.docker_stopped(exit_code=1)
@@ -423,10 +436,12 @@ def session_join(ctx, session_id: str, force_setup: bool):
423
436
  try:
424
437
  from io import BytesIO
425
438
  from zipfile import ZipFile
439
+
426
440
  with ZipFile(BytesIO(files_zip)) as zf:
427
441
  if "difficulty_levels.json" in zf.namelist():
428
442
  with zf.open("difficulty_levels.json") as f:
429
443
  import json
444
+
430
445
  difficulty_info = json.load(f)
431
446
  agent_id = difficulty_info.get("agent_id")
432
447
  except Exception:
@@ -568,474 +583,6 @@ def tasks_run(ctx, task_id: str | None):
568
583
  sys.exit(1)
569
584
 
570
585
 
571
- # Keep the old implementation as _tasks_run_legacy for testing if needed
572
- def _tasks_run_legacy(ctx, task_id: str | None):
573
- """Legacy implementation of tasks run (for testing only)."""
574
- config: Config = ctx.obj["config"]
575
-
576
- if not config.is_authenticated():
577
- console.print("[red]Not logged in. Run: hte-cli auth login[/red]")
578
- sys.exit(1)
579
-
580
- # Check Docker and Compose version
581
- docker_ok, docker_error = _check_docker()
582
- if not docker_ok:
583
- console.print(f"[red]{docker_error}[/red]")
584
- sys.exit(1)
585
-
586
- api = APIClient(config)
587
-
588
- # Get assignments
589
- with Progress(
590
- SpinnerColumn(),
591
- TextColumn("[progress.description]{task.description}"),
592
- console=console,
593
- ) as progress:
594
- progress.add_task("Fetching assignments...", total=None)
595
- try:
596
- assignments = api.get_assignments()
597
- except APIError as e:
598
- console.print(f"[red]Error: {e}[/red]")
599
- sys.exit(1)
600
-
601
- if not assignments:
602
- console.print("[yellow]No pending assignments[/yellow]")
603
- return
604
-
605
- # Find the assignment to run
606
- assignment = None
607
- if task_id:
608
- for a in assignments:
609
- if a["task_id"] == task_id:
610
- assignment = a
611
- break
612
- if not assignment:
613
- console.print(f"[red]Task not found in your assignments: {task_id}[/red]")
614
- sys.exit(1)
615
- else:
616
- # Take highest priority (first in list, already sorted by server)
617
- assignment = assignments[0]
618
-
619
- console.print()
620
- console.print(
621
- Panel(
622
- f"[bold]Task:[/bold] {assignment['task_id']}\n"
623
- f"[bold]Benchmark:[/bold] {assignment['benchmark']}\n"
624
- f"[bold]Mode:[/bold] {assignment['mode']}",
625
- title="Starting Task",
626
- )
627
- )
628
- console.print()
629
-
630
- # Import runner and events
631
- from hte_cli.runner import TaskRunner
632
- from hte_cli.events import EventStreamer
633
-
634
- # Step 1: Start session (or resume existing)
635
- session_id = assignment.get("session_id")
636
- if not session_id:
637
- with Progress(
638
- SpinnerColumn(),
639
- TextColumn("[progress.description]{task.description}"),
640
- console=console,
641
- ) as progress:
642
- progress.add_task("Starting session...", total=None)
643
- try:
644
- session_info = api.start_session(assignment["assignment_id"])
645
- session_id = session_info["session_id"]
646
- except APIError as e:
647
- console.print(f"[red]Failed to start session: {e}[/red]")
648
- sys.exit(1)
649
- else:
650
- console.print(f"[yellow]Resuming session: {session_id}[/yellow]")
651
-
652
- # Create event streamer
653
- events = EventStreamer(api, session_id)
654
-
655
- # Step 2: Download task files and compose
656
- files_zip = None
657
- compose_yaml = None
658
-
659
- with Progress(
660
- SpinnerColumn(),
661
- TextColumn("[progress.description]{task.description}"),
662
- console=console,
663
- ) as progress:
664
- # Download files
665
- task_id_display = progress.add_task("Downloading task files...", total=None)
666
- try:
667
- files_zip = api.get_assignment_files(assignment["assignment_id"])
668
- except APIError as e:
669
- # Files are optional for some benchmarks
670
- console.print(f"[dim]Note: {e}[/dim]")
671
-
672
- # Download compose
673
- progress.update(task_id_display, description="Downloading compose file...")
674
- try:
675
- compose_yaml = api.get_assignment_compose(assignment["assignment_id"])
676
- except APIError as e:
677
- # Compose is optional for simple benchmarks
678
- console.print(f"[dim]Note: {e}[/dim]")
679
-
680
- # Validate compose for sandbox-required benchmarks
681
- benchmark = assignment.get("benchmark", "").lower()
682
- SANDBOX_REQUIRED_BENCHMARKS = {"cybench", "intercode-ctf", "cvebench", "cybergym", "nyuctf"}
683
- if benchmark in SANDBOX_REQUIRED_BENCHMARKS and not compose_yaml:
684
- console.print(
685
- f"[red]Error: This {benchmark} task requires a Docker sandbox but none is available.[/red]"
686
- )
687
- if benchmark == "cvebench":
688
- console.print(
689
- "[yellow]This CVE may have infrastructure issues preventing Docker support.[/yellow]"
690
- )
691
- console.print("[dim]Contact support if you believe this is an error.[/dim]")
692
- sys.exit(1)
693
-
694
- # Step 3: Send session_started event
695
- events.session_started(
696
- {
697
- "cli_version": __version__,
698
- "task_id": assignment["task_id"],
699
- }
700
- )
701
-
702
- # Step 4: Show pre-task message (benchmark-specific)
703
- single_shot_benchmarks = {"nl2bash", "cybashbench"}
704
-
705
- console.print()
706
- if benchmark in single_shot_benchmarks:
707
- # Single-shot benchmarks - no task score available
708
- console.print(
709
- Panel(
710
- "[bold]Instructions[/bold]\n\n"
711
- "You are about to enter the task environment.\n\n"
712
- "[yellow]IMPORTANT: This is a SINGLE-SHOT task.[/yellow]\n"
713
- "[yellow]You get ONE submission attempt - no retries![/yellow]\n\n"
714
- "Commands available:\n"
715
- " [cyan]task status[/cyan] - Show elapsed time\n"
716
- ' [cyan]task submit "answer"[/cyan] - Submit your FINAL answer (ends task)\n'
717
- " [cyan]task quit[/cyan] - Quit without submitting\n"
718
- ' [cyan]task note "text"[/cyan] - Record observations\n',
719
- title="Task Environment",
720
- )
721
- )
722
- else:
723
- # CTF/sandbox benchmarks - task score available
724
- console.print(
725
- Panel(
726
- "[bold]Instructions[/bold]\n\n"
727
- "You are about to enter the task environment.\n\n"
728
- "Commands available:\n"
729
- " [cyan]task status[/cyan] - Show elapsed time\n"
730
- ' [cyan]task score "answer"[/cyan] - CHECK if correct (does NOT end task)\n'
731
- ' [cyan]task submit "answer"[/cyan] - Submit FINAL answer (ends task)\n'
732
- " [cyan]task quit[/cyan] - Quit without submitting\n"
733
- ' [cyan]task note "text"[/cyan] - Record observations\n\n'
734
- "[green]TIP: Use 'task score' to verify before submitting![/green]\n",
735
- title="Task Environment",
736
- )
737
- )
738
- console.print()
739
-
740
- if not click.confirm("Ready to start?"):
741
- console.print("[yellow]Cancelled[/yellow]")
742
- return
743
-
744
- # Step 5: Pre-pull Docker images with progress
745
- from hte_cli.image_utils import extract_images_from_compose
746
- import re
747
- import time
748
-
749
- setup_start_time = time.monotonic()
750
- images: list[str] = []
751
- results: list[tuple[str, bool, str]] = []
752
-
753
- if compose_yaml:
754
- images = extract_images_from_compose(compose_yaml)
755
- if images:
756
- events.setup_started(images)
757
- console.print()
758
- console.print(f"[bold]Preparing Docker environment ({len(images)} images)...[/bold]")
759
-
760
- # Track layer progress per image: {layer_id: (status, downloaded_mb, total_mb)}
761
- image_layers: dict[str, dict[str, tuple[str, float, float]]] = {}
762
-
763
- def parse_size(size_str: str) -> float:
764
- """Parse size string like '1.2MB' or '500kB' to MB."""
765
- size_str = size_str.strip().upper()
766
- if "GB" in size_str:
767
- return float(size_str.replace("GB", "").strip()) * 1024
768
- elif "MB" in size_str:
769
- return float(size_str.replace("MB", "").strip())
770
- elif "KB" in size_str:
771
- return float(size_str.replace("KB", "").strip()) / 1024
772
- elif "B" in size_str:
773
- return float(size_str.replace("B", "").strip()) / (1024 * 1024)
774
- return 0
775
-
776
- def parse_docker_line(line: str) -> tuple[str | None, str, float, float]:
777
- """Parse Docker pull output to extract layer ID, status, and sizes.
778
-
779
- Returns: (layer_id, status, downloaded_mb, total_mb)
780
- """
781
- # Format: "79f742de2855: Downloading [==>] 1.2MB/50MB"
782
- # Or: "79f742de2855: Pull complete"
783
- match = re.match(r"([a-f0-9]+): (.+)", line)
784
- if not match:
785
- return None, "", 0, 0
786
-
787
- layer_id = match.group(1)
788
- status_part = match.group(2)
789
-
790
- # Try to extract size info from "Downloading [==>] 1.2MB/50MB"
791
- size_match = re.search(r"([\d.]+[kKmMgG]?[bB]?)/([\d.]+[kKmMgG]?[bB])", status_part)
792
- if size_match:
793
- downloaded = parse_size(size_match.group(1))
794
- total = parse_size(size_match.group(2))
795
- return layer_id, status_part, downloaded, total
796
-
797
- return layer_id, status_part, 0, 0
798
-
799
- def get_progress_summary(image: str) -> str:
800
- """Get a human-readable progress summary for an image with MB counts."""
801
- if image not in image_layers or not image_layers[image]:
802
- return "connecting..."
803
-
804
- layers = image_layers[image]
805
- total_layers = len(layers)
806
-
807
- # Count layers in different states
808
- complete = 0
809
- downloading = 0
810
- waiting = 0
811
- total_downloaded_mb = 0
812
- total_size_mb = 0
813
-
814
- for status, downloaded, total in layers.values():
815
- status_lower = status.lower()
816
- if "complete" in status_lower:
817
- complete += 1
818
- total_downloaded_mb += total
819
- total_size_mb += total
820
- elif "downloading" in status_lower:
821
- downloading += 1
822
- total_downloaded_mb += downloaded
823
- total_size_mb += total
824
- elif "waiting" in status_lower:
825
- waiting += 1
826
-
827
- # Choose the most informative display
828
- if complete == total_layers and total_layers > 0:
829
- if total_size_mb > 0:
830
- return f"done ({total_size_mb:.0f}MB)"
831
- return f"done ({total_layers} layers)"
832
- elif total_size_mb > 0:
833
- # Show MB progress when available
834
- pct = int(100 * total_downloaded_mb / total_size_mb) if total_size_mb > 0 else 0
835
- return f"{total_downloaded_mb:.0f}/{total_size_mb:.0f}MB ({pct}%)"
836
- elif downloading > 0:
837
- return f"downloading ({complete}/{total_layers} done)"
838
- elif complete > 0:
839
- return f"extracting ({complete}/{total_layers} done)"
840
- elif waiting > 0:
841
- return f"queued ({total_layers} layers)"
842
- else:
843
- return f"preparing ({total_layers} layers)"
844
-
845
- def on_image_progress(image: str, line: str):
846
- """Track layer-level progress with size info."""
847
- if image not in image_layers:
848
- image_layers[image] = {}
849
-
850
- layer_id, status, downloaded, total = parse_docker_line(line)
851
- if layer_id:
852
- image_layers[image][layer_id] = (status, downloaded, total)
853
-
854
- # Process images sequentially with clear output
855
- results = []
856
- for idx, img in enumerate(images, 1):
857
- short_name = img.split("/")[-1] if "/" in img else img
858
-
859
- # Check if cached first
860
- from hte_cli.image_utils import check_image_exists_locally, pull_image_with_progress
861
-
862
- if check_image_exists_locally(img):
863
- console.print(f" [green]✓[/green] {short_name} [dim](cached)[/dim]")
864
- results.append((img, True, "cached"))
865
- continue
866
-
867
- # Need to pull - use Rich Status for live updates
868
- image_layers[img] = {}
869
-
870
- with console.status(
871
- f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
872
- ) as status:
873
-
874
- def show_progress(image: str, line: str):
875
- on_image_progress(image, line)
876
- summary = get_progress_summary(image)
877
- status.update(f"[yellow]↓[/yellow] {short_name} [dim]{summary}[/dim]")
878
-
879
- success = pull_image_with_progress(img, on_progress=show_progress)
880
-
881
- # Final status (printed after status context exits)
882
- if success:
883
- console.print(f" [green]✓[/green] {short_name} [dim](downloaded)[/dim]")
884
- results.append((img, True, "pulled"))
885
- else:
886
- console.print(f" [red]✗[/red] {short_name} [dim](failed)[/dim]")
887
- results.append((img, False, "failed"))
888
-
889
- failed_count = sum(1 for _, ok, _ in results if not ok)
890
- if failed_count > 0:
891
- console.print(
892
- f"[yellow]Warning: {failed_count} image(s) failed to pull. "
893
- "Task may fail to start.[/yellow]"
894
- )
895
- console.print()
896
-
897
- # Record image pull timing
898
- if images:
899
- pull_duration = time.monotonic() - setup_start_time
900
- pulled = [img for img, ok, status in results if ok and status == "pulled"]
901
- cached = [img for img, ok, status in results if ok and status == "cached"]
902
- failed = [img for img, ok, status in results if not ok]
903
- events.image_pull_completed(
904
- duration_seconds=pull_duration,
905
- pulled=pulled,
906
- cached=cached,
907
- failed=failed,
908
- )
909
-
910
- # Step 6: Run Inspect's human_cli
911
- runner = TaskRunner()
912
- console.print("[bold]Starting task environment...[/bold]")
913
- console.print("[dim]Launching Docker containers...[/dim]")
914
- console.print()
915
-
916
- events.docker_started()
917
-
918
- # Record total setup time (image pulls + compose up)
919
- total_setup = time.monotonic() - setup_start_time
920
- events.setup_completed(total_seconds=total_setup)
921
-
922
- eval_log_bytes = None
923
- local_eval_path = None
924
- try:
925
- result = runner.run_from_assignment(
926
- assignment=assignment,
927
- compose_yaml=compose_yaml,
928
- files_zip=files_zip,
929
- )
930
- # Read eval log BEFORE cleanup (cleanup deletes the temp directory)
931
- if result.eval_log_path and result.eval_log_path.exists():
932
- eval_log_bytes = result.eval_log_path.read_bytes()
933
-
934
- # Save local copy for safety
935
- eval_logs_dir = get_eval_logs_dir()
936
- eval_logs_dir.mkdir(parents=True, exist_ok=True)
937
- local_eval_path = eval_logs_dir / result.eval_log_path.name
938
- local_eval_path.write_bytes(eval_log_bytes)
939
- except Exception as e:
940
- events.docker_stopped(exit_code=1)
941
- console.print(f"[red]Task execution failed: {e}[/red]")
942
- sys.exit(1)
943
- finally:
944
- runner.cleanup()
945
-
946
- events.docker_stopped(exit_code=0)
947
-
948
- # Step 6: Show post-task summary
949
- console.print()
950
- console.print(
951
- Panel(
952
- f"[bold]Time spent:[/bold] {result.time_seconds / 60:.1f} minutes\n"
953
- f"[bold]Answer:[/bold] {result.answer or '(none)'}\n"
954
- f"[bold]Score:[/bold] {result.score if result.score is not None else 'pending'}",
955
- title="Task Complete",
956
- )
957
- )
958
-
959
- # Defensive check: don't upload if task didn't actually run
960
- # (catches edge cases where runner returned without proper error)
961
- if result.time_seconds == 0.0 and result.answer is None:
962
- console.print()
963
- console.print("[red]Task did not complete successfully (0 time, no answer).[/red]")
964
- console.print("[yellow]Session preserved - run 'hte-cli tasks run' to retry.[/yellow]")
965
- sys.exit(1)
966
-
967
- # Step 7: Upload result
968
- events.session_completed(
969
- elapsed_seconds=result.time_seconds,
970
- answer=result.answer,
971
- )
972
-
973
- # Extract agent_id from task files for CyberGym post-hoc verification
974
- agent_id = None
975
- if files_zip:
976
- try:
977
- with ZipFile(BytesIO(files_zip)) as zf:
978
- if "difficulty_levels.json" in zf.namelist():
979
- with zf.open("difficulty_levels.json") as f:
980
- difficulty_info = json.load(f)
981
- agent_id = difficulty_info.get("agent_id")
982
- except Exception:
983
- pass # Not a CyberGym task or malformed zip
984
-
985
- # Show upload size info and track timing
986
- upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
987
- upload_size_kb = upload_size_bytes / 1024
988
- if upload_size_kb / 1024 > 50:
989
- console.print(f"[yellow]Warning: Large eval log ({upload_size_kb / 1024:.1f} MB)[/yellow]")
990
-
991
- events.upload_started(size_bytes=upload_size_bytes)
992
- upload_start_time = time.monotonic()
993
-
994
- with Progress(
995
- SpinnerColumn(),
996
- TextColumn("[progress.description]{task.description}"),
997
- console=console,
998
- ) as progress:
999
- size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
1000
- progress.add_task(f"Uploading result{size_str}...", total=None)
1001
-
1002
- try:
1003
- upload_result = api.upload_result(
1004
- session_id=session_id,
1005
- answer=result.answer or "",
1006
- client_active_seconds=result.time_seconds,
1007
- eval_log_bytes=eval_log_bytes,
1008
- score=result.score,
1009
- score_binarized=result.score_binarized,
1010
- agent_id=agent_id,
1011
- )
1012
- except APIError as e:
1013
- console.print(f"[red]Failed to upload result: {e}[/red]")
1014
- if local_eval_path:
1015
- console.print(f"[yellow]Eval log saved locally: {local_eval_path}[/yellow]")
1016
- console.print("[yellow]Your result was saved locally but not uploaded.[/yellow]")
1017
- sys.exit(1)
1018
-
1019
- # Record upload completion
1020
- upload_duration = time.monotonic() - upload_start_time
1021
- events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
1022
-
1023
- console.print()
1024
- console.print("[green]Result uploaded successfully![/green]")
1025
-
1026
- # Show local eval log path (quote paths with spaces for easy copy-paste)
1027
- if local_eval_path:
1028
- path_str = str(local_eval_path)
1029
- if " " in path_str:
1030
- path_str = f'"{path_str}"'
1031
- console.print(f"[dim]Eval log: {path_str}[/dim]")
1032
-
1033
- # Show next task if available
1034
- if upload_result.get("next_assignment_id"):
1035
- console.print()
1036
- console.print("Run [bold]hte-cli tasks run[/bold] for the next task.")
1037
-
1038
-
1039
586
  @tasks.command("pull-images")
1040
587
  @click.option("--count", "-n", default=5, help="Number of upcoming tasks to pull images for")
1041
588
  @click.pass_context
@@ -118,7 +118,11 @@ ENVIRONMENT_CHECKS = {
118
118
  # 32-bit support (for CTF binaries)
119
119
  ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
120
120
  # Task files location - STRICT
121
- ("test -d /home/ctfplayer/ctf_files && echo 'EXISTS'", "EXISTS", "ctf_files directory exists"),
121
+ (
122
+ "test -d /home/ctfplayer/ctf_files && echo 'EXISTS'",
123
+ "EXISTS",
124
+ "ctf_files directory exists",
125
+ ),
122
126
  ],
123
127
  "cybench": [
124
128
  # Working directory: /app (official cybench/cybench:latest image)
@@ -152,7 +156,11 @@ ENVIRONMENT_CHECKS = {
152
156
  # Check that files exist in /root (not just empty directory) using test -gt 0
153
157
  ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
154
158
  # For tasks with subdirs (chunky, etc.), verify structure is mounted
155
- ("test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0", None, "Files under /root (recursive)"),
159
+ (
160
+ "test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0",
161
+ None,
162
+ "Files under /root (recursive)",
163
+ ),
156
164
  ],
157
165
  "cybergym": [
158
166
  # Working directory: /root
@@ -227,12 +235,11 @@ def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
227
235
  expected = task_files.get(task_id) or task_files.get("*", [])
228
236
 
229
237
  for path in expected:
230
- checks.append(
231
- (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
232
- )
238
+ checks.append((f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists"))
233
239
 
234
240
  return checks
235
241
 
242
+
236
243
  # Commands to run for submission tests
237
244
  SUBMISSION_TESTS = {
238
245
  "intercode-ctf": {
@@ -630,11 +637,15 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
630
637
  time.sleep(2)
631
638
  docker_child.expect(prompt_patterns[:-1], timeout=30)
632
639
  output = strip_ansi(docker_child.before or "")
633
-
640
+
634
641
  expected_score = sub_tests.get("score_expect")
635
642
  if expected_score:
636
643
  passed = expected_score.lower() in output.lower()
637
- details = output[:200] if passed else f"Expected '{expected_score}' in output: {output[:100]}..."
644
+ details = (
645
+ output[:200]
646
+ if passed
647
+ else f"Expected '{expected_score}' in output: {output[:100]}..."
648
+ )
638
649
  results.append(TestResult("task score", passed, details))
639
650
  else:
640
651
  results.append(
@@ -686,7 +697,10 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
686
697
  else:
687
698
  results.append(
688
699
  TestResult(
689
- "Submission", False, docker_child.before or "", "Submission timed out waiting for result"
700
+ "Submission",
701
+ False,
702
+ docker_child.before or "",
703
+ "Submission timed out waiting for result",
690
704
  )
691
705
  )
692
706
  elif idx < 3:
@@ -782,18 +796,20 @@ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
782
796
  "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
783
797
  )
784
798
  )
785
-
799
+
786
800
  # Verify answer
787
- if expected_answer and benchmark != "cybergym": # Cybergym submits file content, hard to verify here
788
- results.append(
801
+ if (
802
+ expected_answer and benchmark != "cybergym"
803
+ ): # Cybergym submits file content, hard to verify here
804
+ results.append(
789
805
  TestResult(
790
806
  "Answer matches submission",
791
807
  answer == expected_answer,
792
- f"Expected: '{expected_answer}', Got: '{answer}'"
808
+ f"Expected: '{expected_answer}', Got: '{answer}'",
793
809
  )
794
810
  )
795
811
  else:
796
- results.append(
812
+ results.append(
797
813
  TestResult(
798
814
  "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
799
815
  )
@@ -806,7 +822,9 @@ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
806
822
  pass
807
823
  else:
808
824
  results.append(
809
- TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
825
+ TestResult(
826
+ "Score recorded", score != "", f"Score: {score}" if score else "No score"
827
+ )
810
828
  )
811
829
 
812
830
  # Check events (new flow uses setup_started/setup_completed instead of session_started)
@@ -737,11 +737,14 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
737
737
 
738
738
  phase1_result = subprocess.run(
739
739
  [
740
- "uv", "run", "pytest",
740
+ "uv",
741
+ "run",
742
+ "pytest",
741
743
  str(tests_dir / "test_infrastructure.py"),
742
744
  str(tests_dir / "test_runtime_imports.py"),
743
745
  str(tests_dir / "test_benchmark_flows.py"),
744
- "-v", "--tb=short",
746
+ "-v",
747
+ "--tb=short",
745
748
  ],
746
749
  cwd=tests_dir.parent.parent,
747
750
  )
@@ -788,10 +791,13 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
788
791
 
789
792
  phase3_result = subprocess.run(
790
793
  [
791
- "uv", "run", "pytest",
794
+ "uv",
795
+ "run",
796
+ "pytest",
792
797
  str(tests_dir / "test_session_lifecycle.py"),
793
798
  str(tests_dir / "test_eval_logs.py"),
794
- "-v", "--tb=short",
799
+ "-v",
800
+ "--tb=short",
795
801
  ],
796
802
  cwd=tests_dir.parent.parent,
797
803
  )
@@ -836,10 +842,11 @@ def _print_full_summary(results: dict):
836
842
  if results["phase2"]:
837
843
  passed = sum(1 for v in results["phase2"].values() if v)
838
844
  total = len(results["phase2"])
839
- status = "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
845
+ status = (
846
+ "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
847
+ )
840
848
  details = ", ".join(
841
- f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
842
- for b, v in results["phase2"].items()
849
+ f"[green]{b}[/green]" if v else f"[red]{b}[/red]" for b, v in results["phase2"].items()
843
850
  )
844
851
  table.add_row("Phase 2: Benchmarks", status, details)
845
852
 
@@ -16,7 +16,6 @@ import requests
16
16
  from tests.e2e.conftest import (
17
17
  BASE_URL,
18
18
  EXPECTED_ASSIGNMENT_COUNT,
19
- EXPECTED_TASKS,
20
19
  get_test_user_id,
21
20
  ssh_command,
22
21
  ssh_query,
@@ -379,9 +378,9 @@ class TestCrossBenchmark:
379
378
  SELECT COUNT(*) FROM assignments
380
379
  WHERE user_id = '{get_test_user_id()}'
381
380
  """)
382
- assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
383
- f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
384
- )
381
+ assert (
382
+ int(count) == EXPECTED_ASSIGNMENT_COUNT
383
+ ), f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
385
384
 
386
385
 
387
386
  # =============================================================================
@@ -31,9 +31,12 @@ VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"
31
31
  def db_path_to_host_path(db_path: str) -> str:
32
32
  """Translate container path stored in DB to host path on VPS.
33
33
 
34
- Backend runs in Docker with /opt/hte-web/data mounted as /data,
35
- so paths are stored as /data/... but host has /opt/hte-web/data/...
34
+ Backend may store paths as:
35
+ - /data/... (container-relative, needs translation)
36
+ - /opt/hte-web/data/... (already host path, return as-is)
36
37
  """
38
+ if db_path.startswith("/opt/hte-web/"):
39
+ return db_path # Already a host path
37
40
  return db_path.replace("/data/", "/opt/hte-web/data/")
38
41
 
39
42
 
@@ -145,7 +148,9 @@ class TestVPSEvalLogs:
145
148
  if total_count == 0:
146
149
  pytest.skip("No completed sessions to check")
147
150
 
148
- assert with_path_count == total_count, f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"
151
+ assert (
152
+ with_path_count == total_count
153
+ ), f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"
149
154
 
150
155
  def test_eval_log_files_exist_on_vps(self):
151
156
  """Eval log files referenced in DB should exist on VPS."""
@@ -220,12 +225,12 @@ class TestEvalLogFormat:
220
225
 
221
226
  path = db_path_to_host_path(db_path)
222
227
  # List contents of the gzipped eval (it's actually a zip inside gzip)
223
- # First copy to temp, decompress, check structure
228
+ # Use python's zipfile since unzip may not be installed
224
229
  result = ssh_command(f"""
225
230
  cd /tmp &&
226
231
  cp {path} test_eval.gz &&
227
232
  gunzip -f test_eval.gz &&
228
- unzip -l test_eval 2>/dev/null | head -20
233
+ python3 -c "import zipfile; z=zipfile.ZipFile('test_eval'); print('\\n'.join(z.namelist()[:20]))"
229
234
  """)
230
235
 
231
236
  # Should contain header.json at minimum
@@ -243,18 +248,31 @@ class TestEvalLogUpload:
243
248
  """Test eval log upload functionality."""
244
249
 
245
250
  def test_upload_event_recorded(self):
246
- """Upload events should be recorded in session_events for sessions with eval logs."""
247
- # Only check sessions that have eval_log_path (proves upload succeeded)
251
+ """Upload events should be recorded in session_events for sessions with eval logs.
252
+
253
+ Note: Upload events were added in CLI v0.2.22. Sessions created with older
254
+ CLI versions won't have these events.
255
+ """
256
+ # Find a session that has:
257
+ # 1. eval_log_path (proves upload succeeded)
258
+ # 2. session_started event with cli_version >= 0.2.22 (has upload events)
248
259
  session_id = ssh_query(f"""
249
- SELECT id FROM sessions
250
- WHERE user_id = '{get_test_user_id()}'
251
- AND status = 'submitted'
252
- AND eval_log_path IS NOT NULL
260
+ SELECT s.id FROM sessions s
261
+ JOIN session_events se ON s.id = se.session_id
262
+ WHERE s.user_id = '{get_test_user_id()}'
263
+ AND s.status = 'submitted'
264
+ AND s.eval_log_path IS NOT NULL
265
+ AND se.event_type = 'session_started'
266
+ AND (
267
+ json_extract(se.event_data, '$.cli_version') >= '0.2.22'
268
+ OR json_extract(se.event_data, '$.cli_version') LIKE '0.3.%'
269
+ OR json_extract(se.event_data, '$.cli_version') LIKE '1.%'
270
+ )
253
271
  LIMIT 1
254
272
  """)
255
273
 
256
274
  if not session_id:
257
- pytest.skip("No completed sessions with eval logs")
275
+ pytest.skip("No sessions with CLI v0.2.22+ (upload events added in v0.2.22)")
258
276
 
259
277
  events = ssh_query(f"""
260
278
  SELECT event_type FROM session_events
@@ -265,7 +283,9 @@ class TestEvalLogUpload:
265
283
  event_list = events.split("\n") if events else []
266
284
  has_upload = any("upload" in e.lower() for e in event_list)
267
285
 
268
- assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"
286
+ assert (
287
+ has_upload
288
+ ), f"No upload events found for session {session_id}. Events: {event_list[:5]}"
269
289
 
270
290
  def test_eval_log_size_reasonable(self):
271
291
  """Eval logs should be reasonably sized (not empty, not huge)."""
@@ -114,9 +114,9 @@ class TestAssignments:
114
114
  count = ssh_query(
115
115
  f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
116
116
  )
117
- assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
118
- f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
119
- )
117
+ assert (
118
+ int(count) == EXPECTED_ASSIGNMENT_COUNT
119
+ ), f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
120
120
 
121
121
  @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
122
122
  def test_benchmark_tasks_assigned(self, benchmark, tasks):
@@ -150,9 +150,7 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')
150
150
 
151
151
  assert "Loaded" in result.stdout
152
152
  # Should have exactly 7 benchmarks
153
- assert "7 benchmarks" in result.stdout, (
154
- f"Expected 7 benchmarks, got: {result.stdout}"
155
- )
153
+ assert "7 benchmarks" in result.stdout, f"Expected 7 benchmarks, got: {result.stdout}"
156
154
 
157
155
  def test_backend_can_import_adapters(self):
158
156
  """Backend should be able to instantiate adapters."""
@@ -180,9 +178,9 @@ for name, cls in HUMAN_REGISTRY.items():
180
178
 
181
179
  # All benchmarks should show OK - STRICT check
182
180
  for benchmark in BENCHMARKS:
183
- assert f"{benchmark}: OK" in result.stdout, (
184
- f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
185
- )
181
+ assert (
182
+ f"{benchmark}: OK" in result.stdout
183
+ ), f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
186
184
 
187
185
 
188
186
  class TestLocalImports:
@@ -223,7 +223,6 @@ class TestSessionState:
223
223
  WHERE user_id = '{get_test_user_id()}'
224
224
  AND status = 'abandoned'
225
225
  """)
226
- count = int(abandoned_count) if abandoned_count else 0
227
226
  # Verify the query returned a valid number (not empty/error)
228
227
  assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
229
228
  # Note: count can legitimately be 0 if no sessions were abandoned
@@ -625,7 +625,7 @@ wheels = [
625
625
 
626
626
  [[package]]
627
627
  name = "hte-cli"
628
- version = "0.2.21"
628
+ version = "0.2.22"
629
629
  source = { editable = "." }
630
630
  dependencies = [
631
631
  { name = "click" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes