PyPI - hte-cli - Versions diffs - 0.2.21__tar.gz → 0.2.22__tar.gz - Mend

hte-cli 0.2.21 (tar.gz) → 0.2.22 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

{hte_cli-0.2.21 → hte_cli-0.2.22}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hte-cli
-Version: 0.2.21
+Version: 0.2.22
 Summary: Human Time-to-Completion Evaluation CLI
 Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
 Author: Lyptus Research

{hte_cli-0.2.21 → hte_cli-0.2.22}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "hte-cli"
-version = "0.2.21"
+version = "0.2.22"
 description = "Human Time-to-Completion Evaluation CLI"
 readme = "README.md"
 requires-python = ">=3.11"

{hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/cli.py RENAMED Viewed

@@ -280,6 +280,14 @@ def session_join(ctx, session_id: str, force_setup: bool):
         },
     }
+    # Send session_started event (records CLI version for debugging)
+    events.session_started(
+        {
+            "cli_version": __version__,
+            "task_id": session_info["task_id"],
+        }
+    )
     # Step 3: Run setup (skip if reconnecting without force)
     setup_start_time = time.monotonic()
     images = []
@@ -429,13 +437,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
         console.print(f"Answer: {result.answer}")
         console.print(f"Time: {result.time_seconds:.1f}s")
+        # Track upload size and timing
+        upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
+        upload_size_kb = upload_size_bytes / 1024
+        events.upload_started(size_bytes=upload_size_bytes)
+        upload_start_time = time.monotonic()
         # Upload to server
         with Progress(
             SpinnerColumn(),
             TextColumn("[progress.description]{task.description}"),
             console=console,
         ) as progress:
-            progress.add_task("Uploading result...", total=None)
+            size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
+            progress.add_task(f"Uploading result{size_str}...", total=None)
             try:
                 upload_result = api.upload_result(
                     session_id=session_id,
@@ -450,6 +466,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
                 console.print(f"[red]Failed to upload result: {e}[/red]")
                 sys.exit(1)
+        # Record upload completion
+        upload_duration = time.monotonic() - upload_start_time
+        events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
         if upload_result.get("score") is not None:
             console.print(f"Score: {upload_result['score']}")

{hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/automated_runner.py RENAMED Viewed

@@ -52,6 +52,10 @@ def strip_ansi(text: str) -> str:
 VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
+# Use the pipx-installed CLI (what experts use), not the local venv version
+# This ensures E2E tests validate the actual published PyPI package
+HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
 # Environment verification commands per benchmark
 # Based on docs/benchmark-environments.md
 # Format: (command, expected_pattern, description)
@@ -155,15 +159,15 @@ ENVIRONMENT_CHECKS = {
         ("pwd", "/root", "Working directory is /root"),
         # OS: Kali Linux (same base as cybench)
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # CyberGym-specific: submit.sh MUST exist for PoC submission
-        ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present at /submit.sh"),
-        ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
+        # CyberGym-specific: submit.sh in home dir for PoC submission
+        ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
+        ("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
         # Required tools - STRICT (same image as cybench)
         ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
         ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
         ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
-        # Core security tools - STRICT
-        ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+        # Python via venv (same as cybench/cybench:latest)
+        ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
         ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
         ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
         # apt-get for additional tools
@@ -203,7 +207,7 @@ TASK_FILE_CHECKS = {
         "primary_knowledge": [],  # No local files - network-based
     },
     "cybergym": {
-        "*": ["/submit.sh"],  # All cybergym tasks need submit.sh
+        "*": ["/root/submit.sh"],  # All cybergym tasks need submit.sh in home
     },
     "nyuctf": {
         "*": ["/home/ctfplayer/ctf_files"],  # Standard location
@@ -233,19 +237,19 @@ def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
 SUBMISSION_TESTS = {
     "intercode-ctf": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "incorrect",  # Should say incorrect
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
     "nyuctf": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "incorrect",
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
     "cybench": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "incorrect",
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
@@ -266,6 +270,12 @@ SUBMISSION_TESTS = {
         "submit_cmd": 'task submit "ls -la"',
         "submit_expect": "submitted",
     },
+    "nl2bash": {
+        # Single-shot - no score command (same as cybashbench)
+        "score_cmd": None,
+        "submit_cmd": 'task submit "ls -la"',
+        "submit_expect": "submitted",
+    },
 }
@@ -374,9 +384,16 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     # Start the CLI using the new session join flow
     # Session has status="created", so CLI will run full setup
-    console.print(f"Launching hte-cli session join {session_id}...")
+    # Use explicit pipx path to test the published PyPI version, not local dev
+    if not HTE_CLI_PATH.exists():
+        console.print(f"[red]hte-cli not found at {HTE_CLI_PATH}[/red]")
+        console.print("[yellow]Install with: pipx install hte-cli[/yellow]")
+        results.append(TestResult("CLI installed", False, "", f"hte-cli not at {HTE_CLI_PATH}"))
+        return results
+    console.print(f"Launching {HTE_CLI_PATH} session join {session_id}...")
     child = pexpect.spawn(
-        f"hte-cli session join {session_id}",
+        f"{HTE_CLI_PATH} session join {session_id}",
         encoding="utf-8",
         timeout=timeout,
         env={**os.environ, "TERM": "dumb"},  # Disable colors for easier parsing
@@ -426,6 +443,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
         results.append(TestResult("Environment setup", True, "Environment ready"))
         # Wait for the "Login to the system" message and docker exec command
+        # CVE bench builds containers from source, can take 5+ minutes
         console.print("Waiting for docker exec command...")
         idx = child.expect(
             [
@@ -433,7 +451,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
                 r"docker exec -it",
                 pexpect.TIMEOUT,
             ],
-            timeout=120,
+            timeout=300,  # 5 minutes for slow builds (cvebench)
         )
         if idx == 2:  # TIMEOUT
@@ -603,6 +621,11 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
         # Test score command if available
         if sub_tests.get("score_cmd"):
+            # Clear buffer before score test to avoid capturing stale output
+            try:
+                docker_child.read_nonblocking(size=10000, timeout=0.5)
+            except Exception:
+                pass
             docker_child.sendline(sub_tests["score_cmd"])
             time.sleep(2)
             docker_child.expect(prompt_patterns[:-1], timeout=30)
@@ -776,9 +799,15 @@ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
                 )
             )
-        results.append(
-            TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
-        )
+        # Score check - some benchmarks compute scores server-side later (not immediately)
+        no_immediate_score = benchmark in ("cybashbench", "nl2bash")
+        if no_immediate_score:
+            # These benchmarks don't produce immediate scores - skip check
+            pass
+        else:
+            results.append(
+                TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
+            )
         # Check events (new flow uses setup_started/setup_completed instead of session_started)
         events = ssh_query(f"""

{hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/e2e_test.py RENAMED Viewed

@@ -37,6 +37,9 @@ TEST_NAME = "E2E Test User"
 # CLI config path (matches platformdirs on macOS)
 CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
+# Use the pipx-installed CLI (what experts use), not the local venv version
+HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
 # Task assignments: 4 per benchmark
 # First 2 for pytest API tests, last 2 for interactive tests
 BENCHMARK_TASKS = {
@@ -347,10 +350,10 @@ def setup(admin_password: str, yes: bool):
     CLI_CONFIG_PATH.write_text(json.dumps(config, indent=2))
     console.print("[green]CLI config written[/green]")
-    # 7. Verify CLI works
+    # 7. Verify CLI works (use pipx version, not local venv)
     console.print("\nVerifying CLI authentication...")
     result = subprocess.run(
-        ["hte-cli", "auth", "status"],
+        [str(HTE_CLI_PATH), "auth", "status"],
         capture_output=True,
         text=True,
     )

{hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/test_eval_logs.py RENAMED Viewed

@@ -28,6 +28,15 @@ LOCAL_EVAL_LOGS_DIR = Path.home() / "Library" / "Application Support" / "hte-cli
 VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"
+def db_path_to_host_path(db_path: str) -> str:
+    """Translate container path stored in DB to host path on VPS.
+    Backend runs in Docker with /opt/hte-web/data mounted as /data,
+    so paths are stored as /data/... but host has /opt/hte-web/data/...
+    """
+    return db_path.replace("/data/", "/opt/hte-web/data/")
 def ssh_query(query: str) -> str:
     """Run a sqlite3 query on the VPS."""
     result = subprocess.run(
@@ -129,9 +138,14 @@ class TestVPSEvalLogs:
         """)
         # All completed sessions should have eval log paths
-        assert int(with_path) == int(
-            count
-        ), f"Only {with_path}/{count} completed sessions have eval_log_path"
+        # Handle empty string from SQL query
+        with_path_count = int(with_path) if with_path else 0
+        total_count = int(count) if count else 0
+        if total_count == 0:
+            pytest.skip("No completed sessions to check")
+        assert with_path_count == total_count, f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"
     def test_eval_log_files_exist_on_vps(self):
         """Eval log files referenced in DB should exist on VPS."""
@@ -148,8 +162,9 @@ class TestVPSEvalLogs:
         for path in paths.split("\n"):
             if path:
-                exists = ssh_command(f"test -f {path} && echo exists")
-                assert exists == "exists", f"Eval log not found: {path}"
+                host_path = db_path_to_host_path(path)
+                exists = ssh_command(f"test -f {host_path} && echo exists")
+                assert exists == "exists", f"Eval log not found: {host_path} (DB path: {path})"
 # =============================================================================
@@ -176,32 +191,34 @@ class TestEvalLogFormat:
     def test_eval_log_can_be_decompressed(self):
         """Eval logs should be valid gzip files."""
-        path = ssh_query("""
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)
-        if not path:
+        if not db_path:
             pytest.skip("No eval logs to test")
+        path = db_path_to_host_path(db_path)
         # Try to decompress
         result = ssh_command(f"gunzip -t {path} 2>&1 && echo ok")
         assert "ok" in result, f"Eval log not valid gzip: {result}"
     def test_eval_log_contains_expected_structure(self):
         """Eval logs should contain expected Inspect AI structure."""
-        path = ssh_query("""
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)
-        if not path:
+        if not db_path:
             pytest.skip("No eval logs to test")
+        path = db_path_to_host_path(db_path)
         # List contents of the gzipped eval (it's actually a zip inside gzip)
         # First copy to temp, decompress, check structure
         result = ssh_command(f"""
@@ -226,40 +243,43 @@ class TestEvalLogUpload:
     """Test eval log upload functionality."""
     def test_upload_event_recorded(self):
-        """Upload events should be recorded in session_events."""
+        """Upload events should be recorded in session_events for sessions with eval logs."""
+        # Only check sessions that have eval_log_path (proves upload succeeded)
         session_id = ssh_query(f"""
             SELECT id FROM sessions
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'submitted'
+            AND eval_log_path IS NOT NULL
             LIMIT 1
         """)
         if not session_id:
-            pytest.skip("No completed sessions")
+            pytest.skip("No completed sessions with eval logs")
         events = ssh_query(f"""
             SELECT event_type FROM session_events
             WHERE session_id = '{session_id}'
         """)
-        # Should have upload-related events for completed sessions
+        # Should have upload-related events for sessions with eval logs
         event_list = events.split("\n") if events else []
         has_upload = any("upload" in e.lower() for e in event_list)
-        # Completed sessions should have upload events
         assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"
     def test_eval_log_size_reasonable(self):
         """Eval logs should be reasonably sized (not empty, not huge)."""
-        path = ssh_query("""
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)
-        if not path:
+        if not db_path:
             pytest.skip("No eval logs to test")
+        path = db_path_to_host_path(db_path)
         size = ssh_command(f"stat -c%s {path} 2>/dev/null || stat -f%z {path}")
         if size.isdigit():

{hte_cli-0.2.21 → hte_cli-0.2.22}/uv.lock RENAMED Viewed

@@ -625,7 +625,7 @@ wheels = [
 [[package]]
 name = "hte-cli"
-version = "0.2.20"
+version = "0.2.21"
 source = { editable = "." }
 dependencies = [
     { name = "click" },