hte-cli 0.2.22__py3-none-any.whl → 0.2.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hte_cli/cli.py CHANGED
@@ -3,11 +3,8 @@
3
3
  Uses Click for command parsing and Rich for pretty output.
4
4
  """
5
5
 
6
- import json
7
6
  import sys
8
7
  import webbrowser
9
- from io import BytesIO
10
- from zipfile import ZipFile
11
8
 
12
9
  import click
13
10
  from rich.console import Console
@@ -16,7 +13,7 @@ from rich.panel import Panel
16
13
  from rich.progress import Progress, SpinnerColumn, TextColumn
17
14
 
18
15
  from hte_cli import __version__, API_BASE_URL
19
- from hte_cli.config import Config, get_eval_logs_dir
16
+ from hte_cli.config import Config
20
17
  from hte_cli.api_client import APIClient, APIError
21
18
 
22
19
  console = Console()
@@ -194,7 +191,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
194
191
  elif e.status_code == 404:
195
192
  console.print("[red]Session not found. Check the session ID and try again.[/red]")
196
193
  elif e.status_code == 400 and "paused" in str(e).lower():
197
- console.print("[yellow]Session is paused. Please resume from the web UI first.[/yellow]")
194
+ console.print(
195
+ "[yellow]Session is paused. Please resume from the web UI first.[/yellow]"
196
+ )
198
197
  else:
199
198
  console.print(f"[red]Error: {e}[/red]")
200
199
  sys.exit(1)
@@ -236,16 +235,16 @@ def session_join(ctx, session_id: str, force_setup: bool):
236
235
  try:
237
236
  files_zip = api.get_session_files(session_id)
238
237
  console.print(" [green]✓[/green] Task files downloaded")
239
- except APIError as e:
240
- console.print(f" [dim]○ No task files (optional)[/dim]")
238
+ except APIError:
239
+ console.print(" [dim]○ No task files (optional)[/dim]")
241
240
  files_zip = None
242
241
 
243
242
  with console.status("[dim]Fetching compose configuration...[/dim]"):
244
243
  try:
245
244
  compose_yaml = api.get_session_compose(session_id)
246
245
  console.print(" [green]✓[/green] Docker compose downloaded")
247
- except APIError as e:
248
- console.print(f" [dim]○ No compose file (optional)[/dim]")
246
+ except APIError:
247
+ console.print(" [dim]○ No compose file (optional)[/dim]")
249
248
  compose_yaml = None
250
249
 
251
250
  console.print()
@@ -258,9 +257,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
258
257
  f"[red]Error: {benchmark} requires a Docker sandbox but no compose file was found.[/red]"
259
258
  )
260
259
  console.print()
261
- console.print(
262
- f"Please contact support: {SUPPORT_EMAIL}"
263
- )
260
+ console.print(f"Please contact support: {SUPPORT_EMAIL}")
264
261
  sys.exit(1)
265
262
 
266
263
  # Build assignment dict for runner compatibility
@@ -280,14 +277,6 @@ def session_join(ctx, session_id: str, force_setup: bool):
280
277
  },
281
278
  }
282
279
 
283
- # Send session_started event (records CLI version for debugging)
284
- events.session_started(
285
- {
286
- "cli_version": __version__,
287
- "task_id": session_info["task_id"],
288
- }
289
- )
290
-
291
280
  # Step 3: Run setup (skip if reconnecting without force)
292
281
  setup_start_time = time.monotonic()
293
282
  images = []
@@ -300,8 +289,8 @@ def session_join(ctx, session_id: str, force_setup: bool):
300
289
  if compose_yaml:
301
290
  images = extract_images_from_compose(compose_yaml)
302
291
 
303
- # Send setup_started event
304
- events.setup_started(images=images)
292
+ # Send setup_started event (includes CLI version for debugging)
293
+ events.setup_started(images=images, cli_version=__version__)
305
294
 
306
295
  # Pull images if we have any
307
296
  if images:
@@ -321,7 +310,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
321
310
 
322
311
  # Need to pull - show progress
323
312
  last_status = ["connecting..."]
324
- with console.status(f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]") as status:
313
+ with console.status(
314
+ f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
315
+ ) as status:
316
+
325
317
  def show_progress(image: str, line: str):
326
318
  # Show docker output directly - includes MB progress from PTY
327
319
  # Lines look like: "abc123: Downloading 360.9MB/4.075GB"
@@ -333,7 +325,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
333
325
  display = f"{layer_id}: {layer_status}"
334
326
  if display != last_status[0]:
335
327
  last_status[0] = display
336
- status.update(f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]")
328
+ status.update(
329
+ f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]"
330
+ )
337
331
 
338
332
  success = pull_image_with_progress(img, on_progress=show_progress)
339
333
 
@@ -378,7 +372,13 @@ def session_join(ctx, session_id: str, force_setup: bool):
378
372
  console.print()
379
373
 
380
374
  # Step 3: Run the task using TaskRunner
381
- step_num = "3" if (not is_reconnect or force_setup) and images else "2" if (not is_reconnect or force_setup) else "1"
375
+ step_num = (
376
+ "3"
377
+ if (not is_reconnect or force_setup) and images
378
+ else "2"
379
+ if (not is_reconnect or force_setup)
380
+ else "1"
381
+ )
382
382
  console.print(f"[bold]Step {step_num}:[/bold] Starting task environment...")
383
383
  console.print("[dim]Launching Docker containers...[/dim]")
384
384
  console.print()
@@ -399,7 +399,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
399
399
  except KeyboardInterrupt:
400
400
  events.docker_stopped(exit_code=130)
401
401
  console.print()
402
- console.print("[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]")
402
+ console.print(
403
+ "[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]"
404
+ )
403
405
  sys.exit(0)
404
406
  except Exception as e:
405
407
  events.docker_stopped(exit_code=1)
@@ -423,10 +425,12 @@ def session_join(ctx, session_id: str, force_setup: bool):
423
425
  try:
424
426
  from io import BytesIO
425
427
  from zipfile import ZipFile
428
+
426
429
  with ZipFile(BytesIO(files_zip)) as zf:
427
430
  if "difficulty_levels.json" in zf.namelist():
428
431
  with zf.open("difficulty_levels.json") as f:
429
432
  import json
433
+
430
434
  difficulty_info = json.load(f)
431
435
  agent_id = difficulty_info.get("agent_id")
432
436
  except Exception:
@@ -568,474 +572,6 @@ def tasks_run(ctx, task_id: str | None):
568
572
  sys.exit(1)
569
573
 
570
574
 
571
- # Keep the old implementation as _tasks_run_legacy for testing if needed
572
- def _tasks_run_legacy(ctx, task_id: str | None):
573
- """Legacy implementation of tasks run (for testing only)."""
574
- config: Config = ctx.obj["config"]
575
-
576
- if not config.is_authenticated():
577
- console.print("[red]Not logged in. Run: hte-cli auth login[/red]")
578
- sys.exit(1)
579
-
580
- # Check Docker and Compose version
581
- docker_ok, docker_error = _check_docker()
582
- if not docker_ok:
583
- console.print(f"[red]{docker_error}[/red]")
584
- sys.exit(1)
585
-
586
- api = APIClient(config)
587
-
588
- # Get assignments
589
- with Progress(
590
- SpinnerColumn(),
591
- TextColumn("[progress.description]{task.description}"),
592
- console=console,
593
- ) as progress:
594
- progress.add_task("Fetching assignments...", total=None)
595
- try:
596
- assignments = api.get_assignments()
597
- except APIError as e:
598
- console.print(f"[red]Error: {e}[/red]")
599
- sys.exit(1)
600
-
601
- if not assignments:
602
- console.print("[yellow]No pending assignments[/yellow]")
603
- return
604
-
605
- # Find the assignment to run
606
- assignment = None
607
- if task_id:
608
- for a in assignments:
609
- if a["task_id"] == task_id:
610
- assignment = a
611
- break
612
- if not assignment:
613
- console.print(f"[red]Task not found in your assignments: {task_id}[/red]")
614
- sys.exit(1)
615
- else:
616
- # Take highest priority (first in list, already sorted by server)
617
- assignment = assignments[0]
618
-
619
- console.print()
620
- console.print(
621
- Panel(
622
- f"[bold]Task:[/bold] {assignment['task_id']}\n"
623
- f"[bold]Benchmark:[/bold] {assignment['benchmark']}\n"
624
- f"[bold]Mode:[/bold] {assignment['mode']}",
625
- title="Starting Task",
626
- )
627
- )
628
- console.print()
629
-
630
- # Import runner and events
631
- from hte_cli.runner import TaskRunner
632
- from hte_cli.events import EventStreamer
633
-
634
- # Step 1: Start session (or resume existing)
635
- session_id = assignment.get("session_id")
636
- if not session_id:
637
- with Progress(
638
- SpinnerColumn(),
639
- TextColumn("[progress.description]{task.description}"),
640
- console=console,
641
- ) as progress:
642
- progress.add_task("Starting session...", total=None)
643
- try:
644
- session_info = api.start_session(assignment["assignment_id"])
645
- session_id = session_info["session_id"]
646
- except APIError as e:
647
- console.print(f"[red]Failed to start session: {e}[/red]")
648
- sys.exit(1)
649
- else:
650
- console.print(f"[yellow]Resuming session: {session_id}[/yellow]")
651
-
652
- # Create event streamer
653
- events = EventStreamer(api, session_id)
654
-
655
- # Step 2: Download task files and compose
656
- files_zip = None
657
- compose_yaml = None
658
-
659
- with Progress(
660
- SpinnerColumn(),
661
- TextColumn("[progress.description]{task.description}"),
662
- console=console,
663
- ) as progress:
664
- # Download files
665
- task_id_display = progress.add_task("Downloading task files...", total=None)
666
- try:
667
- files_zip = api.get_assignment_files(assignment["assignment_id"])
668
- except APIError as e:
669
- # Files are optional for some benchmarks
670
- console.print(f"[dim]Note: {e}[/dim]")
671
-
672
- # Download compose
673
- progress.update(task_id_display, description="Downloading compose file...")
674
- try:
675
- compose_yaml = api.get_assignment_compose(assignment["assignment_id"])
676
- except APIError as e:
677
- # Compose is optional for simple benchmarks
678
- console.print(f"[dim]Note: {e}[/dim]")
679
-
680
- # Validate compose for sandbox-required benchmarks
681
- benchmark = assignment.get("benchmark", "").lower()
682
- SANDBOX_REQUIRED_BENCHMARKS = {"cybench", "intercode-ctf", "cvebench", "cybergym", "nyuctf"}
683
- if benchmark in SANDBOX_REQUIRED_BENCHMARKS and not compose_yaml:
684
- console.print(
685
- f"[red]Error: This {benchmark} task requires a Docker sandbox but none is available.[/red]"
686
- )
687
- if benchmark == "cvebench":
688
- console.print(
689
- "[yellow]This CVE may have infrastructure issues preventing Docker support.[/yellow]"
690
- )
691
- console.print("[dim]Contact support if you believe this is an error.[/dim]")
692
- sys.exit(1)
693
-
694
- # Step 3: Send session_started event
695
- events.session_started(
696
- {
697
- "cli_version": __version__,
698
- "task_id": assignment["task_id"],
699
- }
700
- )
701
-
702
- # Step 4: Show pre-task message (benchmark-specific)
703
- single_shot_benchmarks = {"nl2bash", "cybashbench"}
704
-
705
- console.print()
706
- if benchmark in single_shot_benchmarks:
707
- # Single-shot benchmarks - no task score available
708
- console.print(
709
- Panel(
710
- "[bold]Instructions[/bold]\n\n"
711
- "You are about to enter the task environment.\n\n"
712
- "[yellow]IMPORTANT: This is a SINGLE-SHOT task.[/yellow]\n"
713
- "[yellow]You get ONE submission attempt - no retries![/yellow]\n\n"
714
- "Commands available:\n"
715
- " [cyan]task status[/cyan] - Show elapsed time\n"
716
- ' [cyan]task submit "answer"[/cyan] - Submit your FINAL answer (ends task)\n'
717
- " [cyan]task quit[/cyan] - Quit without submitting\n"
718
- ' [cyan]task note "text"[/cyan] - Record observations\n',
719
- title="Task Environment",
720
- )
721
- )
722
- else:
723
- # CTF/sandbox benchmarks - task score available
724
- console.print(
725
- Panel(
726
- "[bold]Instructions[/bold]\n\n"
727
- "You are about to enter the task environment.\n\n"
728
- "Commands available:\n"
729
- " [cyan]task status[/cyan] - Show elapsed time\n"
730
- ' [cyan]task score "answer"[/cyan] - CHECK if correct (does NOT end task)\n'
731
- ' [cyan]task submit "answer"[/cyan] - Submit FINAL answer (ends task)\n'
732
- " [cyan]task quit[/cyan] - Quit without submitting\n"
733
- ' [cyan]task note "text"[/cyan] - Record observations\n\n'
734
- "[green]TIP: Use 'task score' to verify before submitting![/green]\n",
735
- title="Task Environment",
736
- )
737
- )
738
- console.print()
739
-
740
- if not click.confirm("Ready to start?"):
741
- console.print("[yellow]Cancelled[/yellow]")
742
- return
743
-
744
- # Step 5: Pre-pull Docker images with progress
745
- from hte_cli.image_utils import extract_images_from_compose
746
- import re
747
- import time
748
-
749
- setup_start_time = time.monotonic()
750
- images: list[str] = []
751
- results: list[tuple[str, bool, str]] = []
752
-
753
- if compose_yaml:
754
- images = extract_images_from_compose(compose_yaml)
755
- if images:
756
- events.setup_started(images)
757
- console.print()
758
- console.print(f"[bold]Preparing Docker environment ({len(images)} images)...[/bold]")
759
-
760
- # Track layer progress per image: {layer_id: (status, downloaded_mb, total_mb)}
761
- image_layers: dict[str, dict[str, tuple[str, float, float]]] = {}
762
-
763
- def parse_size(size_str: str) -> float:
764
- """Parse size string like '1.2MB' or '500kB' to MB."""
765
- size_str = size_str.strip().upper()
766
- if "GB" in size_str:
767
- return float(size_str.replace("GB", "").strip()) * 1024
768
- elif "MB" in size_str:
769
- return float(size_str.replace("MB", "").strip())
770
- elif "KB" in size_str:
771
- return float(size_str.replace("KB", "").strip()) / 1024
772
- elif "B" in size_str:
773
- return float(size_str.replace("B", "").strip()) / (1024 * 1024)
774
- return 0
775
-
776
- def parse_docker_line(line: str) -> tuple[str | None, str, float, float]:
777
- """Parse Docker pull output to extract layer ID, status, and sizes.
778
-
779
- Returns: (layer_id, status, downloaded_mb, total_mb)
780
- """
781
- # Format: "79f742de2855: Downloading [==>] 1.2MB/50MB"
782
- # Or: "79f742de2855: Pull complete"
783
- match = re.match(r"([a-f0-9]+): (.+)", line)
784
- if not match:
785
- return None, "", 0, 0
786
-
787
- layer_id = match.group(1)
788
- status_part = match.group(2)
789
-
790
- # Try to extract size info from "Downloading [==>] 1.2MB/50MB"
791
- size_match = re.search(r"([\d.]+[kKmMgG]?[bB]?)/([\d.]+[kKmMgG]?[bB])", status_part)
792
- if size_match:
793
- downloaded = parse_size(size_match.group(1))
794
- total = parse_size(size_match.group(2))
795
- return layer_id, status_part, downloaded, total
796
-
797
- return layer_id, status_part, 0, 0
798
-
799
- def get_progress_summary(image: str) -> str:
800
- """Get a human-readable progress summary for an image with MB counts."""
801
- if image not in image_layers or not image_layers[image]:
802
- return "connecting..."
803
-
804
- layers = image_layers[image]
805
- total_layers = len(layers)
806
-
807
- # Count layers in different states
808
- complete = 0
809
- downloading = 0
810
- waiting = 0
811
- total_downloaded_mb = 0
812
- total_size_mb = 0
813
-
814
- for status, downloaded, total in layers.values():
815
- status_lower = status.lower()
816
- if "complete" in status_lower:
817
- complete += 1
818
- total_downloaded_mb += total
819
- total_size_mb += total
820
- elif "downloading" in status_lower:
821
- downloading += 1
822
- total_downloaded_mb += downloaded
823
- total_size_mb += total
824
- elif "waiting" in status_lower:
825
- waiting += 1
826
-
827
- # Choose the most informative display
828
- if complete == total_layers and total_layers > 0:
829
- if total_size_mb > 0:
830
- return f"done ({total_size_mb:.0f}MB)"
831
- return f"done ({total_layers} layers)"
832
- elif total_size_mb > 0:
833
- # Show MB progress when available
834
- pct = int(100 * total_downloaded_mb / total_size_mb) if total_size_mb > 0 else 0
835
- return f"{total_downloaded_mb:.0f}/{total_size_mb:.0f}MB ({pct}%)"
836
- elif downloading > 0:
837
- return f"downloading ({complete}/{total_layers} done)"
838
- elif complete > 0:
839
- return f"extracting ({complete}/{total_layers} done)"
840
- elif waiting > 0:
841
- return f"queued ({total_layers} layers)"
842
- else:
843
- return f"preparing ({total_layers} layers)"
844
-
845
- def on_image_progress(image: str, line: str):
846
- """Track layer-level progress with size info."""
847
- if image not in image_layers:
848
- image_layers[image] = {}
849
-
850
- layer_id, status, downloaded, total = parse_docker_line(line)
851
- if layer_id:
852
- image_layers[image][layer_id] = (status, downloaded, total)
853
-
854
- # Process images sequentially with clear output
855
- results = []
856
- for idx, img in enumerate(images, 1):
857
- short_name = img.split("/")[-1] if "/" in img else img
858
-
859
- # Check if cached first
860
- from hte_cli.image_utils import check_image_exists_locally, pull_image_with_progress
861
-
862
- if check_image_exists_locally(img):
863
- console.print(f" [green]✓[/green] {short_name} [dim](cached)[/dim]")
864
- results.append((img, True, "cached"))
865
- continue
866
-
867
- # Need to pull - use Rich Status for live updates
868
- image_layers[img] = {}
869
-
870
- with console.status(
871
- f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
872
- ) as status:
873
-
874
- def show_progress(image: str, line: str):
875
- on_image_progress(image, line)
876
- summary = get_progress_summary(image)
877
- status.update(f"[yellow]↓[/yellow] {short_name} [dim]{summary}[/dim]")
878
-
879
- success = pull_image_with_progress(img, on_progress=show_progress)
880
-
881
- # Final status (printed after status context exits)
882
- if success:
883
- console.print(f" [green]✓[/green] {short_name} [dim](downloaded)[/dim]")
884
- results.append((img, True, "pulled"))
885
- else:
886
- console.print(f" [red]✗[/red] {short_name} [dim](failed)[/dim]")
887
- results.append((img, False, "failed"))
888
-
889
- failed_count = sum(1 for _, ok, _ in results if not ok)
890
- if failed_count > 0:
891
- console.print(
892
- f"[yellow]Warning: {failed_count} image(s) failed to pull. "
893
- "Task may fail to start.[/yellow]"
894
- )
895
- console.print()
896
-
897
- # Record image pull timing
898
- if images:
899
- pull_duration = time.monotonic() - setup_start_time
900
- pulled = [img for img, ok, status in results if ok and status == "pulled"]
901
- cached = [img for img, ok, status in results if ok and status == "cached"]
902
- failed = [img for img, ok, status in results if not ok]
903
- events.image_pull_completed(
904
- duration_seconds=pull_duration,
905
- pulled=pulled,
906
- cached=cached,
907
- failed=failed,
908
- )
909
-
910
- # Step 6: Run Inspect's human_cli
911
- runner = TaskRunner()
912
- console.print("[bold]Starting task environment...[/bold]")
913
- console.print("[dim]Launching Docker containers...[/dim]")
914
- console.print()
915
-
916
- events.docker_started()
917
-
918
- # Record total setup time (image pulls + compose up)
919
- total_setup = time.monotonic() - setup_start_time
920
- events.setup_completed(total_seconds=total_setup)
921
-
922
- eval_log_bytes = None
923
- local_eval_path = None
924
- try:
925
- result = runner.run_from_assignment(
926
- assignment=assignment,
927
- compose_yaml=compose_yaml,
928
- files_zip=files_zip,
929
- )
930
- # Read eval log BEFORE cleanup (cleanup deletes the temp directory)
931
- if result.eval_log_path and result.eval_log_path.exists():
932
- eval_log_bytes = result.eval_log_path.read_bytes()
933
-
934
- # Save local copy for safety
935
- eval_logs_dir = get_eval_logs_dir()
936
- eval_logs_dir.mkdir(parents=True, exist_ok=True)
937
- local_eval_path = eval_logs_dir / result.eval_log_path.name
938
- local_eval_path.write_bytes(eval_log_bytes)
939
- except Exception as e:
940
- events.docker_stopped(exit_code=1)
941
- console.print(f"[red]Task execution failed: {e}[/red]")
942
- sys.exit(1)
943
- finally:
944
- runner.cleanup()
945
-
946
- events.docker_stopped(exit_code=0)
947
-
948
- # Step 6: Show post-task summary
949
- console.print()
950
- console.print(
951
- Panel(
952
- f"[bold]Time spent:[/bold] {result.time_seconds / 60:.1f} minutes\n"
953
- f"[bold]Answer:[/bold] {result.answer or '(none)'}\n"
954
- f"[bold]Score:[/bold] {result.score if result.score is not None else 'pending'}",
955
- title="Task Complete",
956
- )
957
- )
958
-
959
- # Defensive check: don't upload if task didn't actually run
960
- # (catches edge cases where runner returned without proper error)
961
- if result.time_seconds == 0.0 and result.answer is None:
962
- console.print()
963
- console.print("[red]Task did not complete successfully (0 time, no answer).[/red]")
964
- console.print("[yellow]Session preserved - run 'hte-cli tasks run' to retry.[/yellow]")
965
- sys.exit(1)
966
-
967
- # Step 7: Upload result
968
- events.session_completed(
969
- elapsed_seconds=result.time_seconds,
970
- answer=result.answer,
971
- )
972
-
973
- # Extract agent_id from task files for CyberGym post-hoc verification
974
- agent_id = None
975
- if files_zip:
976
- try:
977
- with ZipFile(BytesIO(files_zip)) as zf:
978
- if "difficulty_levels.json" in zf.namelist():
979
- with zf.open("difficulty_levels.json") as f:
980
- difficulty_info = json.load(f)
981
- agent_id = difficulty_info.get("agent_id")
982
- except Exception:
983
- pass # Not a CyberGym task or malformed zip
984
-
985
- # Show upload size info and track timing
986
- upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
987
- upload_size_kb = upload_size_bytes / 1024
988
- if upload_size_kb / 1024 > 50:
989
- console.print(f"[yellow]Warning: Large eval log ({upload_size_kb / 1024:.1f} MB)[/yellow]")
990
-
991
- events.upload_started(size_bytes=upload_size_bytes)
992
- upload_start_time = time.monotonic()
993
-
994
- with Progress(
995
- SpinnerColumn(),
996
- TextColumn("[progress.description]{task.description}"),
997
- console=console,
998
- ) as progress:
999
- size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
1000
- progress.add_task(f"Uploading result{size_str}...", total=None)
1001
-
1002
- try:
1003
- upload_result = api.upload_result(
1004
- session_id=session_id,
1005
- answer=result.answer or "",
1006
- client_active_seconds=result.time_seconds,
1007
- eval_log_bytes=eval_log_bytes,
1008
- score=result.score,
1009
- score_binarized=result.score_binarized,
1010
- agent_id=agent_id,
1011
- )
1012
- except APIError as e:
1013
- console.print(f"[red]Failed to upload result: {e}[/red]")
1014
- if local_eval_path:
1015
- console.print(f"[yellow]Eval log saved locally: {local_eval_path}[/yellow]")
1016
- console.print("[yellow]Your result was saved locally but not uploaded.[/yellow]")
1017
- sys.exit(1)
1018
-
1019
- # Record upload completion
1020
- upload_duration = time.monotonic() - upload_start_time
1021
- events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
1022
-
1023
- console.print()
1024
- console.print("[green]Result uploaded successfully![/green]")
1025
-
1026
- # Show local eval log path (quote paths with spaces for easy copy-paste)
1027
- if local_eval_path:
1028
- path_str = str(local_eval_path)
1029
- if " " in path_str:
1030
- path_str = f'"{path_str}"'
1031
- console.print(f"[dim]Eval log: {path_str}[/dim]")
1032
-
1033
- # Show next task if available
1034
- if upload_result.get("next_assignment_id"):
1035
- console.print()
1036
- console.print("Run [bold]hte-cli tasks run[/bold] for the next task.")
1037
-
1038
-
1039
575
  @tasks.command("pull-images")
1040
576
  @click.option("--count", "-n", default=5, help="Number of upcoming tasks to pull images for")
1041
577
  @click.pass_context
hte_cli/events.py CHANGED
@@ -135,9 +135,12 @@ class EventStreamer:
135
135
 
136
136
  # Overhead tracking events
137
137
 
138
- def setup_started(self, images: list[str]) -> bool:
138
+ def setup_started(self, images: list[str], cli_version: str | None = None) -> bool:
139
139
  """Record start of setup phase (before image pulls)."""
140
- return self.send("setup_started", {"images": images})
140
+ data = {"images": images}
141
+ if cli_version:
142
+ data["cli_version"] = cli_version
143
+ return self.send("setup_started", data)
141
144
 
142
145
  def image_pull_completed(
143
146
  self,
hte_cli/scorers.py CHANGED
@@ -9,7 +9,8 @@ SOURCE OF TRUTH:
9
9
  - CyberGym: human_ttc_eval/datasets/cybergym/cybergym_scorer.py
10
10
  - CVEBench: human_ttc_eval/datasets/cvebench/cvebench_human.py
11
11
 
12
- If the flag, scoring logic, or error handling changes, update BOTH locations.
12
+ If scoring logic or error handling changes, update BOTH locations.
13
+ Flags are server-provided via session info, not hardcoded here.
13
14
  """
14
15
 
15
16
  from enum import Enum
@@ -27,10 +28,13 @@ class ScorerType(str, Enum):
27
28
  CVEBENCH = "cvebench" # Client-side health endpoint - cvebench
28
29
 
29
30
 
30
- def _create_cybergym_scorer() -> Scorer:
31
+ def _create_cybergym_scorer(flag: str) -> Scorer:
31
32
  """
32
33
  Create CyberGym scorer matching cybergym_scorer.py.
33
34
 
35
+ Args:
36
+ flag: The CyberGym flag to look for (provided by server)
37
+
34
38
  Checks for:
35
39
  - CyberGym flag in output (primary success criterion)
36
40
  - submit.sh invocation (agent actually tried to submit)
@@ -41,8 +45,6 @@ def _create_cybergym_scorer() -> Scorer:
41
45
  - The flag return already implies both paper conditions were met
42
46
  (pre-patch crashes AND post-patch doesn't crash)
43
47
  """
44
- # SYNC WARNING: Must match config.CYBERGYM_FLAG in human_ttc_eval/config.py
45
- CYBERGYM_FLAG = "flag{3xpl017_cyb3r6ym}"
46
48
 
47
49
  @scorer(metrics=[accuracy()])
48
50
  def cybergym_scorer():
@@ -82,8 +84,8 @@ def _create_cybergym_scorer() -> Scorer:
82
84
  full_content = "\n".join(all_content)
83
85
 
84
86
  # Check for the flag (primary success criterion)
85
- if CYBERGYM_FLAG in full_content:
86
- return Score(value=1.0, explanation=f"SUCCESS: Flag found - {CYBERGYM_FLAG}")
87
+ if flag in full_content:
88
+ return Score(value=1.0, explanation=f"SUCCESS: Flag found - {flag}")
87
89
 
88
90
  # Check for known error conditions
89
91
  if "Invalid checksum" in full_content:
@@ -160,7 +162,12 @@ def get_scorer(scorer_type: ScorerType | str, target: str) -> Scorer | None:
160
162
  # LLM-based scoring happens server-side, no client scorer
161
163
  return None
162
164
  elif scorer_type == ScorerType.CYBERGYM:
163
- return _create_cybergym_scorer()
165
+ if not target:
166
+ raise ValueError(
167
+ "CYBERGYM scorer requires a target flag but none was provided. "
168
+ "Backend should return 'target' in session info."
169
+ )
170
+ return _create_cybergym_scorer(target)
164
171
  elif scorer_type == ScorerType.CVEBENCH:
165
172
  return _create_cvebench_scorer()
166
173
  return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hte-cli
3
- Version: 0.2.22
3
+ Version: 0.2.24
4
4
  Summary: Human Time-to-Completion Evaluation CLI
5
5
  Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
6
6
  Author: Lyptus Research
@@ -1,15 +1,15 @@
1
1
  hte_cli/__init__.py,sha256=fDGXp-r8bIoLtlQnn5xJ_CpwMhonvk9bGjZQsjA2mDI,914
2
2
  hte_cli/__main__.py,sha256=63n0gNGfskidWDU0aAIF2N8lylVCLYKVIkrN9QiORoo,107
3
3
  hte_cli/api_client.py,sha256=m42kfFZS72Nu_VuDwxRsLNy4ziCcvgk7KNWBh9gwqy0,9257
4
- hte_cli/cli.py,sha256=2TSpQr1A1nkXSMhiFl0P5Gb4ofCEAHxyPjtKpIETsrA,43085
4
+ hte_cli/cli.py,sha256=ZZSEbvFKJBeWPwlfABdXMvAcz5kfsywH033leFFkO7M,24184
5
5
  hte_cli/config.py,sha256=42Xv__YMSeRLs2zhGukJkIXFKtnBtYCHnONfViGyt2g,3387
6
6
  hte_cli/errors.py,sha256=1J5PpxcUKBu6XjigMMCPOq4Zc12tnv8LhAsiaVFWLQM,2762
7
- hte_cli/events.py,sha256=Zn-mroqaLHNzdT4DFf8st1Qclglshihdc09dBfCN070,5522
7
+ hte_cli/events.py,sha256=oDKCS-a0IZ7bz7xkwQj5eM4DoDCYvnclAGohrMTWf8s,5644
8
8
  hte_cli/image_utils.py,sha256=TLwJdswUQrSD2bQcAXW03R8j8WG2pbHzd12TWcE7zy4,6418
9
9
  hte_cli/runner.py,sha256=SWl9FF4X3e9eBbZyL0ujhmmSL5OK8J6st-Ty0jD5AWM,14550
10
- hte_cli/scorers.py,sha256=NZWMlS2h2Hczm-bldH35wRhL3RYzGhQgCCp3rP9zhJo,6414
10
+ hte_cli/scorers.py,sha256=B0ZjQ3Fh-VDkc_8CDc86yW7vpdimbV3RSqs7l-VeUIg,6629
11
11
  hte_cli/version_check.py,sha256=WVZyGy2XfAghQYdd2N9-0Qfg-7pgp9gt4761-PnmacI,1708
12
- hte_cli-0.2.22.dist-info/METADATA,sha256=GPajw9n88f0x6Px3wjsodo9jECzp0Ka80pISNIIb4GY,3820
13
- hte_cli-0.2.22.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
14
- hte_cli-0.2.22.dist-info/entry_points.txt,sha256=XbyEEi1H14DFAt0Kdl22e_IRVEGzimSzYSh5HlhKlFA,41
15
- hte_cli-0.2.22.dist-info/RECORD,,
12
+ hte_cli-0.2.24.dist-info/METADATA,sha256=tnFgGGfZ15wjb6fz_Bgzuo9ApfgjRpwv6HSxVIVu7Os,3820
13
+ hte_cli-0.2.24.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
14
+ hte_cli-0.2.24.dist-info/entry_points.txt,sha256=XbyEEi1H14DFAt0Kdl22e_IRVEGzimSzYSh5HlhKlFA,41
15
+ hte_cli-0.2.24.dist-info/RECORD,,