hte-cli 0.2.29__tar.gz → 0.2.31__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {hte_cli-0.2.29 → hte_cli-0.2.31}/PKG-INFO +1 -1
  2. {hte_cli-0.2.29 → hte_cli-0.2.31}/pyproject.toml +1 -1
  3. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/api_client.py +24 -0
  4. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/cli.py +114 -20
  5. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/image_utils.py +1 -4
  6. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/unit/test_image_utils.py +3 -1
  7. {hte_cli-0.2.29 → hte_cli-0.2.31}/uv.lock +1 -1
  8. {hte_cli-0.2.29 → hte_cli-0.2.31}/.gitignore +0 -0
  9. {hte_cli-0.2.29 → hte_cli-0.2.31}/README.md +0 -0
  10. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/__init__.py +0 -0
  11. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/__main__.py +0 -0
  12. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/config.py +0 -0
  13. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/errors.py +0 -0
  14. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/events.py +0 -0
  15. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/runner.py +0 -0
  16. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/scorers.py +0 -0
  17. {hte_cli-0.2.29 → hte_cli-0.2.31}/src/hte_cli/version_check.py +0 -0
  18. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/__init__.py +0 -0
  19. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/__init__.py +0 -0
  20. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/automated_runner.py +0 -0
  21. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/conftest.py +0 -0
  22. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/e2e_test.py +0 -0
  23. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/test_benchmark_flows.py +0 -0
  24. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/test_eval_logs.py +0 -0
  25. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/test_infrastructure.py +0 -0
  26. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/test_runtime_imports.py +0 -0
  27. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/test_session_lifecycle.py +0 -0
  28. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/e2e/verify_docker_deps.py +0 -0
  29. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/unit/__init__.py +0 -0
  30. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/unit/conftest.py +0 -0
  31. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.29 → hte_cli-0.2.31}/tests/unit/test_scorers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hte-cli
3
- Version: 0.2.29
3
+ Version: 0.2.31
4
4
  Summary: Human Time-to-Completion Evaluation CLI
5
5
  Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
6
6
  Author: Lyptus Research
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "hte-cli"
3
- version = "0.2.29"
3
+ version = "0.2.31"
4
4
  description = "Human Time-to-Completion Evaluation CLI"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -265,3 +265,27 @@ class APIClient:
265
265
  json=payload,
266
266
  timeout=UPLOAD_TIMEOUT,
267
267
  )
268
+
269
+ def upload_partial_log(
270
+ self,
271
+ session_id: str,
272
+ eval_log_bytes: bytes,
273
+ ) -> dict:
274
+ """Upload partial eval log for interrupted sessions.
275
+
276
+ Args:
277
+ session_id: The session ID
278
+ eval_log_bytes: Partial eval log content
279
+
280
+ Returns:
281
+ Response dict with status and log_path
282
+ """
283
+ payload = {
284
+ "eval_log_base64": base64.b64encode(eval_log_bytes).decode("ascii"),
285
+ }
286
+
287
+ return self.post(
288
+ f"/sessions/{session_id}/partial-log",
289
+ json=payload,
290
+ timeout=UPLOAD_TIMEOUT,
291
+ )
@@ -23,6 +23,42 @@ console = Console()
23
23
  SUPPORT_EMAIL = "jacktpayne51@gmail.com"
24
24
 
25
25
 
26
+ def _find_eval_log_bytes(runner) -> bytes | None:
27
+ """Find and read eval log bytes from runner's work directory.
28
+
29
+ Used for interrupted sessions to upload partial logs.
30
+ """
31
+ try:
32
+ # Look for eval logs in the work directory
33
+ if not runner.work_dir.exists():
34
+ return None
35
+
36
+ # Find any .eval files in the work directory tree
37
+ eval_files = list(runner.work_dir.rglob("*.eval"))
38
+ if not eval_files:
39
+ return None
40
+
41
+ # Get the most recent one
42
+ eval_files.sort(key=lambda p: p.stat().st_mtime, reverse=True)
43
+ return eval_files[0].read_bytes()
44
+ except Exception:
45
+ return None
46
+
47
+
48
+ def _upload_partial_log(api: APIClient, session_id: str, eval_log_bytes: bytes, console: Console) -> None:
49
+ """Upload partial eval log for interrupted session.
50
+
51
+ Best-effort: silently handles failures to not block exit.
52
+ """
53
+ try:
54
+ size_kb = len(eval_log_bytes) / 1024
55
+ console.print(f"[dim]Uploading partial eval log ({size_kb:.0f} KB)...[/dim]")
56
+ api.upload_partial_log(session_id, eval_log_bytes)
57
+ console.print("[dim]Partial eval log uploaded.[/dim]")
58
+ except Exception as e:
59
+ console.print(f"[dim]Could not upload partial log: {e}[/dim]")
60
+
61
+
26
62
  @click.group()
27
63
  @click.version_option(__version__, prog_name="hte-cli")
28
64
  @click.pass_context
@@ -282,6 +318,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
282
318
  # Validate compose for sandbox-required benchmarks
283
319
  benchmark = session_info.get("benchmark", "").lower()
284
320
  SANDBOX_REQUIRED_BENCHMARKS = {"cybench", "intercode-ctf", "cvebench", "cybergym", "nyuctf"}
321
+
322
+ # NYU CTF requires a shared Docker network for agent/game-server communication
323
+ if benchmark == "nyuctf":
324
+ _ensure_nyuctf_network()
325
+
285
326
  if benchmark in SANDBOX_REQUIRED_BENCHMARKS and not compose_yaml and not is_reconnect:
286
327
  console.print(
287
328
  f"[red]Error: {benchmark} requires a Docker sandbox but no compose file was found.[/red]"
@@ -333,15 +374,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
333
374
  host_platform = get_host_docker_platform()
334
375
 
335
376
  if is_linux_arm:
336
- console.print(
337
- f"[yellow]![/yellow] Detected [bold]Linux ARM64[/bold] environment"
338
- )
377
+ console.print("[yellow]![/yellow] Detected [bold]Linux ARM64[/bold] environment")
339
378
  console.print(
340
379
  f" [dim]Will verify cached images match host architecture ({host_platform})[/dim]"
341
380
  )
342
- console.print(
343
- f" [dim]Mismatched images will be automatically re-pulled[/dim]"
344
- )
381
+ console.print(" [dim]Mismatched images will be automatically re-pulled[/dim]")
345
382
  console.print()
346
383
 
347
384
  console.print(f"[bold]Step 2:[/bold] Pulling {len(images)} Docker image(s)...")
@@ -377,7 +414,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
377
414
  f" [dim]Cached image: {image_arch} | Host: {host_arch}[/dim]"
378
415
  )
379
416
  console.print(
380
- f" [dim]Removing cached image and re-pulling correct architecture...[/dim]"
417
+ " [dim]Removing cached image and re-pulling correct architecture...[/dim]"
381
418
  )
382
419
 
383
420
  needed_fix, fix_msg = fix_image_architecture(img)
@@ -391,7 +428,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
391
428
  # No ARM variant available - this is an x86-only image
392
429
  # Re-pull the amd64 version and warn about QEMU
393
430
  console.print(
394
- f" [dim]No ARM variant available - re-pulling x86 version...[/dim]"
431
+ " [dim]No ARM variant available - re-pulling x86 version...[/dim]"
395
432
  )
396
433
  success = pull_image_with_progress(img)
397
434
  if success:
@@ -402,7 +439,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
402
439
  pulled_images.append(img)
403
440
  continue
404
441
  else:
405
- console.print(f" [red]✗[/red] {short_name} [dim](failed to pull)[/dim]")
442
+ console.print(
443
+ f" [red]✗[/red] {short_name} [dim](failed to pull)[/dim]"
444
+ )
406
445
  failed_images.append(img)
407
446
  pull_errors[img] = "failed to pull x86 fallback"
408
447
  continue
@@ -426,12 +465,12 @@ def session_join(ctx, session_id: str, force_setup: bool):
426
465
 
427
466
  def show_progress(image: str, line: str):
428
467
  # Show docker output directly - includes MB progress from PTY
429
- # Lines look like: "abc123: Downloading 360.9MB/4.075GB"
468
+ # Lines look like: "abc123: Downloading [======> ] 360.9MB/4.075GB"
430
469
  if ": " in line:
431
470
  parts = line.split(": ", 1)
432
471
  if len(parts) == 2:
433
472
  layer_id = parts[0][-8:]
434
- layer_status = parts[1][:45]
473
+ layer_status = parts[1][:70] # Increased to include size info
435
474
  display = f"{layer_id}: {layer_status}"
436
475
  if display != last_status[0]:
437
476
  last_status[0] = display
@@ -450,6 +489,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
450
489
  # On Linux ARM64, verify pulled image architecture
451
490
  if is_linux_arm:
452
491
  from hte_cli.image_utils import get_image_architecture
492
+
453
493
  pulled_arch = get_image_architecture(img)
454
494
 
455
495
  if pulled_arch == "arm64":
@@ -462,7 +502,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
462
502
  f" [yellow]![/yellow] {short_name} [dim](downloaded, arch: amd64)[/dim]"
463
503
  )
464
504
  console.print(
465
- f" [yellow]This is an x86 image - requires QEMU emulation on ARM[/yellow]"
505
+ " [yellow]This is an x86 image - requires QEMU emulation on ARM[/yellow]"
466
506
  )
467
507
  x86_images_on_arm.append(img)
468
508
  else:
@@ -494,9 +534,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
494
534
  console.print(
495
535
  f"[yellow]⚠ Warning:[/yellow] {len(x86_images_on_arm)} x86 image(s) detected on ARM host"
496
536
  )
497
- console.print(
498
- " These require QEMU emulation. If container fails to start, run:"
499
- )
537
+ console.print(" These require QEMU emulation. If container fails to start, run:")
500
538
  console.print(
501
539
  " [bold]docker run --privileged --rm tonistiigi/binfmt --install all[/bold]"
502
540
  )
@@ -513,14 +551,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
513
551
 
514
552
  # Architecture-specific advice
515
553
  if is_linux_arm:
516
- console.print(f" 2. You're on Linux ARM64 - try: docker pull <image> --platform linux/arm64")
517
- console.print(" 3. For x86-only images, enable QEMU: docker run --privileged --rm tonistiigi/binfmt --install all")
554
+ console.print(
555
+ " 2. You're on Linux ARM64 - try: docker pull <image> --platform linux/arm64"
556
+ )
557
+ console.print(
558
+ " 3. For x86-only images, enable QEMU: docker run --privileged --rm tonistiigi/binfmt --install all"
559
+ )
518
560
  else:
519
561
  console.print(" 2. Try manual pull: docker pull <image>")
520
562
 
521
563
  console.print(" 4. Check network connectivity")
522
564
  console.print()
523
- console.print("Session remains active - you can retry with: hte-cli session join " + session_id)
565
+ console.print(
566
+ "Session remains active - you can retry with: hte-cli session join "
567
+ + session_id
568
+ )
524
569
  sys.exit(1)
525
570
 
526
571
  # Send setup_completed - THIS STARTS THE TIMER ON SERVER
@@ -574,6 +619,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
574
619
  eval_log_bytes = result.eval_log_path.read_bytes()
575
620
  except KeyboardInterrupt:
576
621
  events.docker_stopped(exit_code=130)
622
+ # Try to find and upload any partial eval log before exiting
623
+ eval_log_bytes = _find_eval_log_bytes(runner)
624
+ if eval_log_bytes:
625
+ _upload_partial_log(api, session_id, eval_log_bytes, console)
577
626
  console.print()
578
627
  console.print(
579
628
  "[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]"
@@ -581,6 +630,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
581
630
  sys.exit(0)
582
631
  except Exception as e:
583
632
  events.docker_stopped(exit_code=1)
633
+ # Try to upload partial log on failure too
634
+ eval_log_bytes = _find_eval_log_bytes(runner)
635
+ if eval_log_bytes:
636
+ _upload_partial_log(api, session_id, eval_log_bytes, console)
584
637
  console.print(f"[red]Task execution failed: {e}[/red]")
585
638
  sys.exit(1)
586
639
  finally:
@@ -930,7 +983,9 @@ def diagnose_cmd():
930
983
  console.print(" [red]✗[/red] QEMU x86 emulation NOT working")
931
984
  console.print()
932
985
  console.print(" [yellow]To enable QEMU emulation, run:[/yellow]")
933
- console.print(" [bold]docker run --privileged --rm tonistiigi/binfmt --install all[/bold]")
986
+ console.print(
987
+ " [bold]docker run --privileged --rm tonistiigi/binfmt --install all[/bold]"
988
+ )
934
989
  except subprocess.TimeoutExpired:
935
990
  console.print(" [yellow]![/yellow] QEMU test timed out")
936
991
  except Exception as e:
@@ -974,7 +1029,10 @@ def _check_docker() -> tuple[bool, str | None]:
974
1029
  timeout=10,
975
1030
  )
976
1031
  if result.returncode != 0:
977
- return False, "Docker is not running. Start Docker (Docker Desktop, colima, or dockerd)."
1032
+ return (
1033
+ False,
1034
+ "Docker is not running. Start Docker (Docker Desktop, colima, or dockerd).",
1035
+ )
978
1036
  except FileNotFoundError:
979
1037
  return False, "Docker is not installed. Install from https://docs.docker.com/get-docker/"
980
1038
  except Exception as e:
@@ -1023,5 +1081,41 @@ def _check_docker() -> tuple[bool, str | None]:
1023
1081
  return True, None
1024
1082
 
1025
1083
 
1084
+ def _ensure_nyuctf_network() -> None:
1085
+ """Ensure the ctfnet Docker network exists for NYU CTF challenges.
1086
+
1087
+ NYU CTF tasks use a shared Docker network ('ctfnet') for communication
1088
+ between the agent container and game-server container. This network must
1089
+ exist before docker compose up is called, since it's declared as external.
1090
+ """
1091
+ import subprocess
1092
+
1093
+ NETWORK_NAME = "ctfnet"
1094
+
1095
+ try:
1096
+ # Check if network exists
1097
+ result = subprocess.run(
1098
+ ["docker", "network", "inspect", NETWORK_NAME],
1099
+ capture_output=True,
1100
+ text=True,
1101
+ timeout=10,
1102
+ )
1103
+ if result.returncode == 0:
1104
+ return # Network exists
1105
+
1106
+ # Create the network
1107
+ subprocess.run(
1108
+ ["docker", "network", "create", NETWORK_NAME],
1109
+ capture_output=True,
1110
+ text=True,
1111
+ check=True,
1112
+ timeout=10,
1113
+ )
1114
+ except subprocess.CalledProcessError:
1115
+ pass # Network creation failed, will error later with clearer message
1116
+ except (subprocess.TimeoutExpired, FileNotFoundError):
1117
+ pass # Docker not available, will error later
1118
+
1119
+
1026
1120
  if __name__ == "__main__":
1027
1121
  cli()
@@ -124,6 +124,7 @@ def is_running_in_linux_vm_on_arm() -> bool:
124
124
  True if running Linux on ARM64
125
125
  """
126
126
  import sys
127
+
127
128
  return sys.platform == "linux" and get_host_architecture() in ("aarch64", "arm64")
128
129
 
129
130
 
@@ -313,10 +314,6 @@ def pull_image_with_progress(
313
314
 
314
315
  # Read output from master with timeout
315
316
  output_buffer = ""
316
- # Regex to parse docker progress: "abc123: Downloading [===> ] 10.5MB/50MB"
317
- progress_pattern = re.compile(
318
- r"([a-f0-9]+):\s*(Downloading|Extracting|Verifying Checksum|Download complete|Pull complete|Already exists|Waiting)(?:\s+\[.*?\]\s+)?(\d+\.?\d*\s*[kMG]?B)?(?:/(\d+\.?\d*\s*[kMG]?B))?"
319
- )
320
317
 
321
318
  while True:
322
319
  # Check if process is done
@@ -610,7 +610,9 @@ class TestFixImageArchitecture:
610
610
  @patch("hte_cli.image_utils.remove_image")
611
611
  @patch("hte_cli.image_utils.check_image_architecture_matches_host")
612
612
  @patch("hte_cli.image_utils.platform.machine")
613
- def test_returns_false_when_repull_fails(self, mock_machine, mock_check, mock_remove, mock_pull):
613
+ def test_returns_false_when_repull_fails(
614
+ self, mock_machine, mock_check, mock_remove, mock_pull
615
+ ):
614
616
  """Returns (False, message) when re-pull fails."""
615
617
  mock_machine.return_value = "aarch64"
616
618
  mock_check.return_value = (False, "amd64", "aarch64")
@@ -625,7 +625,7 @@ wheels = [
625
625
 
626
626
  [[package]]
627
627
  name = "hte-cli"
628
- version = "0.2.29"
628
+ version = "0.2.30"
629
629
  source = { editable = "." }
630
630
  dependencies = [
631
631
  { name = "click" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes