hte-cli 0.2.19__tar.gz → 0.2.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {hte_cli-0.2.19 → hte_cli-0.2.22}/PKG-INFO +1 -1
  2. {hte_cli-0.2.19 → hte_cli-0.2.22}/pyproject.toml +1 -1
  3. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/cli.py +21 -1
  4. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/automated_runner.py +180 -79
  5. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/conftest.py +35 -0
  6. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/e2e_test.py +154 -25
  7. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_benchmark_flows.py +12 -3
  8. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_eval_logs.py +43 -22
  9. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_infrastructure.py +9 -29
  10. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_runtime_imports.py +8 -4
  11. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_session_lifecycle.py +22 -10
  12. {hte_cli-0.2.19 → hte_cli-0.2.22}/uv.lock +1 -1
  13. {hte_cli-0.2.19 → hte_cli-0.2.22}/.gitignore +0 -0
  14. {hte_cli-0.2.19 → hte_cli-0.2.22}/README.md +0 -0
  15. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/__init__.py +0 -0
  16. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/__main__.py +0 -0
  17. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/api_client.py +0 -0
  18. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/config.py +0 -0
  19. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/errors.py +0 -0
  20. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/events.py +0 -0
  21. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/image_utils.py +0 -0
  22. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/runner.py +0 -0
  23. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/scorers.py +0 -0
  24. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/version_check.py +0 -0
  25. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/__init__.py +0 -0
  26. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/__init__.py +0 -0
  27. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/verify_docker_deps.py +0 -0
  28. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/__init__.py +0 -0
  29. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/conftest.py +0 -0
  30. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_image_utils.py +0 -0
  31. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_scorers.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hte-cli
- Version: 0.2.19
+ Version: 0.2.22
  Summary: Human Time-to-Completion Evaluation CLI
  Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
  Author: Lyptus Research
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "hte-cli"
- version = "0.2.19"
+ version = "0.2.22"
  description = "Human Time-to-Completion Evaluation CLI"
  readme = "README.md"
  requires-python = ">=3.11"
src/hte_cli/cli.py
@@ -280,6 +280,14 @@ def session_join(ctx, session_id: str, force_setup: bool):
  },
  }

+ # Send session_started event (records CLI version for debugging)
+ events.session_started(
+ {
+ "cli_version": __version__,
+ "task_id": session_info["task_id"],
+ }
+ )
+
  # Step 3: Run setup (skip if reconnecting without force)
  setup_start_time = time.monotonic()
  images = []
@@ -429,13 +437,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
  console.print(f"Answer: {result.answer}")
  console.print(f"Time: {result.time_seconds:.1f}s")

+ # Track upload size and timing
+ upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
+ upload_size_kb = upload_size_bytes / 1024
+
+ events.upload_started(size_bytes=upload_size_bytes)
+ upload_start_time = time.monotonic()
+
  # Upload to server
  with Progress(
  SpinnerColumn(),
  TextColumn("[progress.description]{task.description}"),
  console=console,
  ) as progress:
- progress.add_task("Uploading result...", total=None)
+ size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
+ progress.add_task(f"Uploading result{size_str}...", total=None)
  try:
  upload_result = api.upload_result(
  session_id=session_id,
@@ -450,6 +466,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
  console.print(f"[red]Failed to upload result: {e}[/red]")
  sys.exit(1)

+ # Record upload completion
+ upload_duration = time.monotonic() - upload_start_time
+ events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
+
  if upload_result.get("score") is not None:
  console.print(f"Score: {upload_result['score']}")

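The net effect of these cli.py changes is that a completed session now records a session_started event (carrying the CLI version) plus upload_started/upload_completed events around the result upload. A minimal sketch, assuming only direct read access to the server's sqlite database and the session_events columns already used by the E2E tests further down (the database path is illustrative, not part of the package):

import sqlite3

def list_session_events(db_path: str, session_id: str) -> list[str]:
    """Return the event_type values recorded for one session."""
    con = sqlite3.connect(db_path)
    try:
        rows = con.execute(
            "SELECT event_type FROM session_events WHERE session_id = ?",
            (session_id,),
        ).fetchall()
    finally:
        con.close()
    return [row[0] for row in rows]

# After a successful run on 0.2.22 the list should include session_started,
# upload_started and upload_completed alongside the setup events.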
tests/e2e/automated_runner.py
@@ -52,11 +52,15 @@ def strip_ansi(text: str) -> str:

  VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")

+ # Use the pipx-installed CLI (what experts use), not the local venv version
+ # This ensures E2E tests validate the actual published PyPI package
+ HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
  # Environment verification commands per benchmark
  # Based on docs/benchmark-environments.md
  # Format: (command, expected_pattern, description)
- # expected_pattern=None means just check command succeeds (exit 0)
- # These tests are STRICT - they verify the exact environment specified in docs
+ # STRICT: All tests must pass - no optional/informational checks
+ # These tests verify the exact environment specified in docs
  ENVIRONMENT_CHECKS = {
  "intercode-ctf": [
  # Working directory: /ctf/{task_num} (e.g., /ctf/3, /ctf/85)
@@ -69,8 +73,8 @@ ENVIRONMENT_CHECKS = {
  ("which netcat", "/usr/bin/netcat", "netcat installed"),
  ("which tcpdump", "/usr/bin/tcpdump", "tcpdump installed"),
  ("which curl", "/usr/bin/curl", "curl installed"),
- # Forensics tools
- ("which mmls", None, "sleuthkit installed (mmls)"),
+ # Forensics tools - STRICT: must exist
+ ("which mmls", "/usr/bin/mmls", "mmls (sleuthkit) at /usr/bin/mmls"),
  # Python packages - must import successfully
  ("python3 -c 'from pwn import *; print(\"pwntools\")'", "pwntools", "pwntools importable"),
  ("python3 -c 'from scapy.all import *; print(\"scapy\")'", "scapy", "scapy importable"),
@@ -90,16 +94,17 @@ ENVIRONMENT_CHECKS = {
  ("pwd", "/home/ctfplayer", "Working directory is /home/ctfplayer"),
  # User: ctfplayer (NOT root)
  ("whoami", "ctfplayer", "Running as ctfplayer user"),
- ("id -u", None, "User ID check"), # Should be non-zero
+ # User ID must be 1000 (standard non-root)
+ ("id -u", "1000", "User ID is 1000"),
  # Passwordless sudo must work
  ("sudo -n whoami", "root", "Passwordless sudo to root"),
  # OS: Ubuntu 22.04 specifically
  ("cat /etc/os-release | grep VERSION_ID", "22.04", "Ubuntu 22.04 base"),
- # Core tools at expected paths
+ # Core tools at expected paths - STRICT
  ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
- ("which radare2", "/usr/bin/radare2", "radare2 installed"),
- ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
- ("which jadx", None, "jadx installed"),
+ ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+ ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+ ("which jadx", "/usr/local/bin/jadx", "jadx at /usr/local/bin/jadx"),
  # Python venv must exist at expected location
  (
  "test -f /home/ctfplayer/.ctfenv/bin/python3 && echo 'venv'",
@@ -112,72 +117,72 @@ ENVIRONMENT_CHECKS = {
  ("python3 -c 'import z3; print(\"z3\")'", "z3", "z3-solver importable"),
  # 32-bit support (for CTF binaries)
  ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
+ # Task files location - STRICT
+ ("test -d /home/ctfplayer/ctf_files && echo 'EXISTS'", "EXISTS", "ctf_files directory exists"),
  ],
  "cybench": [
- # Working directory: /root (Kali default)
- ("pwd", "/root", "Working directory is /root"),
+ # Working directory: /app (official cybench/cybench:latest image)
+ ("pwd", "/app", "Working directory is /app"),
  # OS: Kali Linux (must contain 'Kali' in name)
  ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
- # Image detection tools (see docs/cybench-image-investigation.md):
- # - aisiuk/evals-cybench-agent-sandbox: NO gdb, NO ghidra, YES radare2
- # - cybench/cybench:latest: YES gdb, NO ghidra, YES radare2
- # - cybench/kali-linux-large:latest: YES gdb, YES ghidra, YES radare2
- # All of these pass - we just want to know which image
- ("which gdb || echo 'NOT_FOUND'", None, "[IMAGE] gdb"),
- ("which ghidra || echo 'NOT_FOUND'", None, "[IMAGE] ghidra"),
- ("which radare2 || echo 'NOT_FOUND'", None, "[IMAGE] radare2"),
- ("which binwalk || echo 'NOT_FOUND'", None, "[IMAGE] binwalk"),
- # Security tools
- ("which nmap", "/usr/bin/nmap", "nmap installed"),
- ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
- ("which msfconsole || echo 'NOT_FOUND'", None, "[IMAGE] msfconsole"),
- ("which hydra || echo 'NOT_FOUND'", None, "[IMAGE] hydra"),
- ("which john || echo 'NOT_FOUND'", None, "[IMAGE] john"),
- # Docker-in-Docker capability (required for web challenges)
+ # Required tools - STRICT: must exist at expected paths
+ # Using cybench/cybench:latest which has all these tools
+ ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+ ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+ ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+ ("which binwalk", "/usr/bin/binwalk", "binwalk at /usr/bin/binwalk"),
+ # Security tools - STRICT
+ ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+ ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+ # Docker available (daemon may not be running in all test contexts)
  ("docker --version", "Docker version", "Docker available"),
- ("docker ps 2>&1 | head -1", None, "Docker daemon accessible"),
- # Python 3
- ("which python3", None, "python3 installed"),
- ("python3 --version", "Python 3", "Python 3.x"),
+ # NOTE: Docker daemon check (docker ps) removed - DinD requires privileged
+ # mode which isn't always available. Web challenges needing DinD will
+ # fail at runtime if daemon isn't accessible.
+ # Python via venv - STRICT
+ ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
+ (
+ "/venv/bin/python3 -c 'from pwn import *; print(\"pwntools\")'",
+ "pwntools",
+ "pwntools in /venv",
+ ),
  # apt-get for additional tools
  ("apt-get --version", "apt", "apt-get available"),
  # Task files visible - cybench tasks have nested directories (blog/, nginx/, etc.)
- # Check that files exist in /root (not just empty directory)
- ("ls -la /root/ | grep -E '^d|^-' | wc -l", None, "Items in /root"),
+ # Check that files exist in /root (not just empty directory) using test -gt 0
+ ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
  # For tasks with subdirs (chunky, etc.), verify structure is mounted
- ("find /root -type f 2>/dev/null | head -5", None, "Files under /root (recursive)"),
+ ("test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0", None, "Files under /root (recursive)"),
  ],
  "cybergym": [
  # Working directory: /root
  ("pwd", "/root", "Working directory is /root"),
  # OS: Kali Linux (same base as cybench)
  ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
- # CyberGym-specific: submit.sh MUST exist for PoC submission
- ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present at /submit.sh"),
- ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
- # Image detection (same as cybench - uses cybench/cybench:latest)
- ("which gdb || echo 'NOT_FOUND'", None, "[IMAGE] gdb"),
- ("which ghidra || echo 'NOT_FOUND'", None, "[IMAGE] ghidra"),
- ("which radare2 || echo 'NOT_FOUND'", None, "[IMAGE] radare2"),
- # Core security tools
- ("which python3", None, "python3 installed"),
- ("which nmap", "/usr/bin/nmap", "nmap installed"),
- ("which curl", "/usr/bin/curl", "curl installed"),
+ # CyberGym-specific: submit.sh in home dir for PoC submission
+ ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
+ ("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
+ # Required tools - STRICT (same image as cybench)
+ ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+ ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+ ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+ # Python via venv (same as cybench/cybench:latest)
+ ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
+ ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+ ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
  # apt-get for additional tools
  ("apt-get --version", "apt", "apt-get available"),
- # Task files from setup.sh
- ("ls /root/ 2>/dev/null | wc -l", None, "Files in /root"),
  ],
  "cvebench": [
  # OS: Kali Linux
  ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
- # Core security tools for CVE exploitation
- ("which python3", None, "python3 installed"),
- ("which nmap", "/usr/bin/nmap", "nmap installed"),
- ("which curl", "/usr/bin/curl", "curl installed"),
- ("which wget", "/usr/bin/wget", "wget installed"),
- # Network access to target
- ("which nc", None, "netcat installed"),
+ # Core security tools for CVE exploitation - STRICT
+ ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+ ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+ ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
+ ("which wget", "/usr/bin/wget", "wget at /usr/bin/wget"),
+ # Network access to target - STRICT
+ ("which nc", "/usr/bin/nc", "netcat at /usr/bin/nc"),
  # apt-get for additional tools
  ("apt-get --version", "apt", "apt-get available"),
  ],
@@ -191,23 +196,60 @@ ENVIRONMENT_CHECKS = {
  ],
  }

+ # Task-specific file expectations
+ # Maps benchmark -> task_id -> list of expected paths
+ # Use "*" as task_id for checks that apply to all tasks in a benchmark
+ TASK_FILE_CHECKS = {
+ "cybench": {
+ "chunky": ["/root/blog", "/root/nginx", "/root/rcache"],
+ "avatar": ["/root/chall.py"],
+ "lootstash": ["/root/stash"],
+ "primary_knowledge": [], # No local files - network-based
+ },
+ "cybergym": {
+ "*": ["/root/submit.sh"], # All cybergym tasks need submit.sh in home
+ },
+ "nyuctf": {
+ "*": ["/home/ctfplayer/ctf_files"], # Standard location
+ },
+ "intercode-ctf": {
+ "*": ["/ctf"], # Task files in /ctf directory
+ },
+ }
+
+
+ def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
+ """Generate file existence checks based on task expectations."""
+ checks = []
+ task_files = TASK_FILE_CHECKS.get(benchmark, {})
+
+ # Try task-specific first, then fallback to wildcard
+ expected = task_files.get(task_id) or task_files.get("*", [])
+
+ for path in expected:
+ checks.append(
+ (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
+ )
+
+ return checks
+
  # Commands to run for submission tests
  SUBMISSION_TESTS = {
  "intercode-ctf": {
  "score_cmd": 'task score "test_flag_123"',
- "score_expect": "incorrect", # Should say incorrect
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
  "submit_cmd": 'task submit "test_flag_for_e2e"',
  "submit_expect": "submitted",
  },
  "nyuctf": {
  "score_cmd": 'task score "test_flag_123"',
- "score_expect": "incorrect",
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
  "submit_cmd": 'task submit "test_flag_for_e2e"',
  "submit_expect": "submitted",
  },
  "cybench": {
  "score_cmd": 'task score "test_flag_123"',
- "score_expect": "incorrect",
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
  "submit_cmd": 'task submit "test_flag_for_e2e"',
  "submit_expect": "submitted",
  },
@@ -228,6 +270,12 @@ SUBMISSION_TESTS = {
  "submit_cmd": 'task submit "ls -la"',
  "submit_expect": "submitted",
  },
+ "nl2bash": {
+ # Single-shot - no score command (same as cybashbench)
+ "score_cmd": None,
+ "submit_cmd": 'task submit "ls -la"',
+ "submit_expect": "submitted",
+ },
  }

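For reference, this is roughly what the new get_file_checks_for_task helper added above produces; the import path is an assumption based on the tests/e2e/automated_runner.py layout shown in the file list:

from tests.e2e.automated_runner import get_file_checks_for_task

# A task with explicit expectations yields one (command, expected, description)
# tuple per path listed in TASK_FILE_CHECKS.
checks = get_file_checks_for_task("cybench", "chunky")
# [("test -e /root/blog && echo 'EXISTS'", "EXISTS", "/root/blog exists"),
#  ("test -e /root/nginx && echo 'EXISTS'", "EXISTS", "/root/nginx exists"),
#  ("test -e /root/rcache && echo 'EXISTS'", "EXISTS", "/root/rcache exists")]

# Tasks without an entry fall back to the benchmark's "*" wildcard, and an empty
# entry (e.g. primary_knowledge) simply adds no extra file checks.
assert get_file_checks_for_task("cybench", "primary_knowledge") == []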
@@ -336,9 +384,16 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

  # Start the CLI using the new session join flow
  # Session has status="created", so CLI will run full setup
- console.print(f"Launching hte-cli session join {session_id}...")
+ # Use explicit pipx path to test the published PyPI version, not local dev
+ if not HTE_CLI_PATH.exists():
+ console.print(f"[red]hte-cli not found at {HTE_CLI_PATH}[/red]")
+ console.print("[yellow]Install with: pipx install hte-cli[/yellow]")
+ results.append(TestResult("CLI installed", False, "", f"hte-cli not at {HTE_CLI_PATH}"))
+ return results
+
+ console.print(f"Launching {HTE_CLI_PATH} session join {session_id}...")
  child = pexpect.spawn(
- f"hte-cli session join {session_id}",
+ f"{HTE_CLI_PATH} session join {session_id}",
  encoding="utf-8",
  timeout=timeout,
  env={**os.environ, "TERM": "dumb"}, # Disable colors for easier parsing
@@ -388,6 +443,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  results.append(TestResult("Environment setup", True, "Environment ready"))

  # Wait for the "Login to the system" message and docker exec command
+ # CVE bench builds containers from source, can take 5+ minutes
  console.print("Waiting for docker exec command...")
  idx = child.expect(
  [
@@ -395,7 +451,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  r"docker exec -it",
  pexpect.TIMEOUT,
  ],
- timeout=120,
+ timeout=300, # 5 minutes for slow builds (cvebench)
  )

  if idx == 2: # TIMEOUT
@@ -503,7 +559,12 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

  # Get benchmark-specific checks from ENVIRONMENT_CHECKS
  env_checks = ENVIRONMENT_CHECKS.get(benchmark, [])
- for check in env_checks:
+
+ # Add task-specific file checks
+ file_checks = get_file_checks_for_task(benchmark, task_id)
+ all_checks = env_checks + file_checks
+
+ for check in all_checks:
  # Unpack: (command, expected_pattern, description)
  if len(check) == 3:
  cmd, expected, desc = check
@@ -517,7 +578,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  ["docker", "exec", container_name, "sh", "-c", cmd],
  capture_output=True,
  text=True,
- timeout=15,
+ timeout=30, # Increased for slow imports (angr takes ~10s)
  )
  output = result.stdout.strip()
  stderr = result.stderr.strip()
@@ -560,17 +621,29 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

  # Test score command if available
  if sub_tests.get("score_cmd"):
+ # Clear buffer before score test to avoid capturing stale output
+ try:
+ docker_child.read_nonblocking(size=10000, timeout=0.5)
+ except Exception:
+ pass
  docker_child.sendline(sub_tests["score_cmd"])
  time.sleep(2)
  docker_child.expect(prompt_patterns[:-1], timeout=30)
  output = strip_ansi(docker_child.before or "")
- results.append(
- TestResult(
- "task score",
- True, # Just checking it runs
- output[:200],
+
+ expected_score = sub_tests.get("score_expect")
+ if expected_score:
+ passed = expected_score.lower() in output.lower()
+ details = output[:200] if passed else f"Expected '{expected_score}' in output: {output[:100]}..."
+ results.append(TestResult("task score", passed, details))
+ else:
+ results.append(
+ TestResult(
+ "task score",
+ True, # Just checking it runs
+ output[:200],
+ )
  )
- )

  # Submit answer
  console.print("Submitting test answer...")
@@ -605,14 +678,15 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  timeout=60,
  )
  # EOF (idx=4) is expected - container exits after task submit
- if idx <= 4:
+ # TIMEOUT (idx=3) is a failure
+ if idx != 3:
  results.append(
  TestResult("Submission", True, "Answer submitted (container exited)")
  )
  else:
  results.append(
  TestResult(
- "Submission", False, docker_child.before or "", "Submission may have failed"
+ "Submission", False, docker_child.before or "", "Submission timed out waiting for result"
  )
  )
  elif idx < 3:
@@ -624,7 +698,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  else:
  results.append(
  TestResult(
- "Submission", False, docker_child.before or "", "Submission may have failed"
+ "Submission", False, docker_child.before or "", "Submission timed out at prompt"
  )
  )

@@ -671,10 +745,19 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  return results


- def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
+ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
  """Verify session and eval log artifacts were created."""
  results = []

+ # Get expected answer from SUBMISSION_TESTS
+ sub_tests = SUBMISSION_TESTS.get(benchmark, {})
+ submit_cmd = sub_tests.get("submit_cmd", "")
+ expected_answer = None
+ if 'task submit "' in submit_cmd:
+ expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
+ elif "task submit '" in submit_cmd:
+ expected_answer = submit_cmd.split("task submit '")[1].rstrip("'")
+
  # Check session in database
  session_info = ssh_query(f"""
@@ -699,14 +782,32 @@ def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
  "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
  )
  )
- results.append(
- TestResult(
- "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+
+ # Verify answer
+ if expected_answer and benchmark != "cybergym": # Cybergym submits file content, hard to verify here
+ results.append(
+ TestResult(
+ "Answer matches submission",
+ answer == expected_answer,
+ f"Expected: '{expected_answer}', Got: '{answer}'"
+ )
+ )
+ else:
+ results.append(
+ TestResult(
+ "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+ )
+ )
+
+ # Score check - some benchmarks compute scores server-side later (not immediately)
+ no_immediate_score = benchmark in ("cybashbench", "nl2bash")
+ if no_immediate_score:
+ # These benchmarks don't produce immediate scores - skip check
+ pass
+ else:
+ results.append(
+ TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
  )
- )
- results.append(
- TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
- )


  # Check events (new flow uses setup_started/setup_completed instead of session_started)
tests/e2e/conftest.py
@@ -22,6 +22,41 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
  TEST_EMAIL = "e2e-test@lyptus.dev"
  CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"

+ # Expected task assignments per benchmark (4 per benchmark)
+ # First 2 for pytest API tests, last 2 for interactive/automated tests
+ EXPECTED_TASKS = {
+ "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
+ "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
+ "intercode-ctf": [
+ "intercode-ctf_general_skills/task_4",
+ "intercode-ctf_reverse/task_1",
+ "intercode-ctf_reverse/task_3",
+ "intercode-ctf_reverse/task_85",
+ ],
+ "nyuctf": [
+ "2013f-cry-stfu",
+ "2016q-rev-rock",
+ "2023q-rev-baby_s_first",
+ "2019q-cry-super_curve",
+ ],
+ "cybashbench": [
+ "cybashbench_pwn/nl2bash_003",
+ "cybashbench_reverse/nl2bash_004",
+ "cybashbench_web/nl2bash_001",
+ "cybashbench_forensics/nl2bash_005",
+ ],
+ "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+ "nl2bash": [
+ "nl2bash_complex/task_8581",
+ "nl2bash_complex/task_713",
+ "nl2bash_complex/task_712",
+ "nl2bash_complex/task_8796",
+ ],
+ }
+
+ # Total expected assignments (calculated from EXPECTED_TASKS)
+ EXPECTED_ASSIGNMENT_COUNT = sum(len(tasks) for tasks in EXPECTED_TASKS.values())
+

  def ssh_query(query: str) -> str:
  """Run a sqlite3 query on the VPS (READ-ONLY)."""
tests/e2e/e2e_test.py
@@ -37,6 +37,9 @@ TEST_NAME = "E2E Test User"
  # CLI config path (matches platformdirs on macOS)
  CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"

+ # Use the pipx-installed CLI (what experts use), not the local venv version
+ HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
  # Task assignments: 4 per benchmark
  # First 2 for pytest API tests, last 2 for interactive tests
  BENCHMARK_TASKS = {
@@ -61,6 +64,12 @@ BENCHMARK_TASKS = {
  "cybashbench_forensics/nl2bash_005",
  ],
  "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+ "nl2bash": [
+ "nl2bash_complex/task_8581",
+ "nl2bash_complex/task_713",
+ "nl2bash_complex/task_712",
+ "nl2bash_complex/task_8796",
+ ],
  }

  # Instructions for each benchmark type
@@ -341,10 +350,10 @@ def setup(admin_password: str, yes: bool):
  CLI_CONFIG_PATH.write_text(json.dumps(config, indent=2))
  console.print("[green]CLI config written[/green]")

- # 7. Verify CLI works
+ # 7. Verify CLI works (use pipx version, not local venv)
  console.print("\nVerifying CLI authentication...")
  result = subprocess.run(
- ["hte-cli", "auth", "status"],
+ [str(HTE_CLI_PATH), "auth", "status"],
  capture_output=True,
  text=True,
  )
@@ -688,38 +697,158 @@ def cleanup():
  help="Admin password for API access",
  )
  @click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompts")
- def full(admin_password: str, yes: bool):
- """Run complete E2E test suite (setup, run all, verify, cleanup)."""
- console.print("\n[bold]Full E2E Test Suite[/bold]\n")
- console.print("[yellow]This will run all benchmarks interactively.[/yellow]")
- console.print("You'll need to interact with each task container.\n")
-
- if not yes and not click.confirm("Continue?"):
+ @click.option("--skip-setup", is_flag=True, help="Skip setup if already done")
+ @click.option("--cleanup-after", is_flag=True, help="Run cleanup after tests")
+ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
+ """Run complete E2E test suite in 3 phases.
+
+ Phase 1: Infrastructure tests (pytest, fast, no containers)
+ Phase 2: Automated benchmark E2E tests (pexpect, creates completed sessions)
+ Phase 3: Session verification tests (pytest, validates completed sessions)
+
+ This is fully automated - no user interaction required.
+ """
+ console.print(Panel("[bold]Full E2E Test Suite - 3 Phases[/bold]", style="cyan"))
+ console.print("""
+ [dim]Phase 1:[/dim] Infrastructure tests (pytest)
+ [dim]Phase 2:[/dim] Automated benchmark E2E tests (pexpect)
+ [dim]Phase 3:[/dim] Session verification tests (pytest)
+ """)
+
+ if not yes and not click.confirm("Run full automated E2E suite?"):
  raise click.ClickException("Aborted")

- # Setup
- ctx = click.get_current_context()
- ctx.invoke(setup, admin_password=admin_password, yes=yes)
+ results = {"phase1": None, "phase2": {}, "phase3": None}
+ tests_dir = Path(__file__).parent
+
+ # Setup (unless skipped)
+ if not skip_setup:
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]SETUP: Creating test user and assignments[/bold cyan]")
+ console.print("=" * 60)
+ ctx = click.get_current_context()
+ ctx.invoke(setup, admin_password=admin_password, yes=True)
+
+ # Phase 1: Infrastructure tests
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]PHASE 1: Infrastructure Tests[/bold cyan]")
+ console.print("=" * 60)
+ console.print("[dim]Running pytest on infrastructure, imports, benchmark flows...[/dim]\n")
+
+ phase1_result = subprocess.run(
+ [
+ "uv", "run", "pytest",
+ str(tests_dir / "test_infrastructure.py"),
+ str(tests_dir / "test_runtime_imports.py"),
+ str(tests_dir / "test_benchmark_flows.py"),
+ "-v", "--tb=short",
+ ],
+ cwd=tests_dir.parent.parent,
+ )
+ results["phase1"] = phase1_result.returncode == 0

- # Run each benchmark
- for benchmark in BENCHMARK_TASKS.keys():
- console.print(f"\n{'=' * 50}")
- console.print(f"[bold]Benchmark: {benchmark}[/bold]")
+ if not results["phase1"]:
+ console.print("\n[red bold]Phase 1 FAILED - stopping[/red bold]")
+ _print_full_summary(results)
+ raise SystemExit(1)
+
+ console.print("\n[green]Phase 1 PASSED[/green]")

- for i in range(2):
- if click.confirm(f"\nRun task {i+1}/2 for {benchmark}?"):
- ctx.invoke(run, benchmark=benchmark, task_index=i)
+ # Phase 2: Automated benchmark E2E tests
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]PHASE 2: Automated Benchmark E2E Tests[/bold cyan]")
+ console.print("=" * 60)
+ console.print("[dim]Running automated tests for each benchmark via pexpect...[/dim]\n")
+
+ from automated_runner import run_benchmark_test
+
+ for benchmark in BENCHMARK_TASKS.keys():
+ console.print(f"\n[bold]--- {benchmark} ---[/bold]")
+ try:
+ # Run task index 2 (third task, reserved for automated E2E)
+ success = run_benchmark_test(benchmark, task_index=2)
+ results["phase2"][benchmark] = success
+ if success:
+ console.print(f"[green]{benchmark}: PASSED[/green]")
+ else:
+ console.print(f"[red]{benchmark}: FAILED[/red]")
+ except Exception as e:
+ console.print(f"[red]{benchmark}: ERROR - {e}[/red]")
+ results["phase2"][benchmark] = False
+
+ phase2_passed = all(results["phase2"].values())
+ if not phase2_passed:
+ console.print("\n[yellow]Phase 2 had failures - continuing to Phase 3[/yellow]")
+
+ # Phase 3: Session verification tests
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]PHASE 3: Session Verification Tests[/bold cyan]")
+ console.print("=" * 60)
+ console.print("[dim]Running pytest on session lifecycle and eval logs...[/dim]\n")
+
+ phase3_result = subprocess.run(
+ [
+ "uv", "run", "pytest",
+ str(tests_dir / "test_session_lifecycle.py"),
+ str(tests_dir / "test_eval_logs.py"),
+ "-v", "--tb=short",
+ ],
+ cwd=tests_dir.parent.parent,
+ )
+ results["phase3"] = phase3_result.returncode == 0

- # Verify
- console.print(f"\n{'=' * 50}")
- ctx.invoke(verify, admin_password=admin_password)
+ # Summary
+ _print_full_summary(results)

  # Cleanup
- console.print(f"\n{'=' * 50}")
- if click.confirm("Run cleanup?"):
+ if cleanup_after:
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]CLEANUP[/bold cyan]")
+ ctx = click.get_current_context()
  ctx.invoke(cleanup)

- console.print("\n[bold green]Full E2E test complete![/bold green]")
+ # Exit with appropriate code
+ all_passed = results["phase1"] and phase2_passed and results["phase3"]
+ if all_passed:
+ console.print("\n[bold green]All phases PASSED![/bold green]")
+ else:
+ console.print("\n[bold red]Some phases FAILED[/bold red]")
+ raise SystemExit(1)
+
+
+ def _print_full_summary(results: dict):
+ """Print summary table of all phases."""
+ console.print("\n" + "=" * 60)
+ console.print("[bold]SUMMARY[/bold]")
+ console.print("=" * 60)
+
+ table = Table()
+ table.add_column("Phase", style="cyan")
+ table.add_column("Status")
+ table.add_column("Details")
+
+ # Phase 1
+ if results["phase1"] is not None:
+ status = "[green]PASSED[/green]" if results["phase1"] else "[red]FAILED[/red]"
+ table.add_row("Phase 1: Infrastructure", status, "pytest infra/imports/flows")
+
+ # Phase 2
+ if results["phase2"]:
+ passed = sum(1 for v in results["phase2"].values() if v)
+ total = len(results["phase2"])
+ status = "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+ details = ", ".join(
+ f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
+ for b, v in results["phase2"].items()
+ )
+ table.add_row("Phase 2: Benchmarks", status, details)
+
+ # Phase 3
+ if results["phase3"] is not None:
+ status = "[green]PASSED[/green]" if results["phase3"] else "[red]FAILED[/red]"
+ table.add_row("Phase 3: Verification", status, "pytest lifecycle/logs")
+
+ console.print(table)


  if __name__ == "__main__":
tests/e2e/test_benchmark_flows.py
@@ -13,7 +13,14 @@ Run with: uv run pytest tests/e2e/test_benchmark_flows.py -v
  import pytest
  import requests

- from tests.e2e.conftest import BASE_URL, get_test_user_id, ssh_command, ssh_query
+ from tests.e2e.conftest import (
+ BASE_URL,
+ EXPECTED_ASSIGNMENT_COUNT,
+ EXPECTED_TASKS,
+ get_test_user_id,
+ ssh_command,
+ ssh_query,
+ )

  # Benchmark test configurations
  # First 2 tasks for pytest API tests, last 2 for interactive tests
@@ -367,12 +374,14 @@ class TestCrossBenchmark:
  assert int(count) > 0, f"No assignments for {benchmark}"

  def test_total_assignments_correct(self):
- """Total assignments should be 24 (4 per benchmark)."""
+ """Total assignments should match expected count (4 per benchmark)."""
  count = ssh_query(f"""
  SELECT COUNT(*) FROM assignments
  WHERE user_id = '{get_test_user_id()}'
  """)
- assert int(count) == 24
+ assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+ f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+ )


  # =============================================================================
tests/e2e/test_eval_logs.py
@@ -28,6 +28,15 @@ LOCAL_EVAL_LOGS_DIR = Path.home() / "Library" / "Application Support" / "hte-cli
  VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"


+ def db_path_to_host_path(db_path: str) -> str:
+ """Translate container path stored in DB to host path on VPS.
+
+ Backend runs in Docker with /opt/hte-web/data mounted as /data,
+ so paths are stored as /data/... but host has /opt/hte-web/data/...
+ """
+ return db_path.replace("/data/", "/opt/hte-web/data/")
+
+
  def ssh_query(query: str) -> str:
  """Run a sqlite3 query on the VPS."""
  result = subprocess.run(
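As a concrete example of the translation (the eval-log filename below is made up for illustration):

from tests.e2e.test_eval_logs import db_path_to_host_path

# The backend container stores paths under /data; on the VPS host the same
# volume lives under /opt/hte-web/data.
db_path = "/data/eval_logs/session-1234.eval.gz"  # hypothetical filename
assert db_path_to_host_path(db_path) == "/opt/hte-web/data/eval_logs/session-1234.eval.gz"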
@@ -85,8 +94,8 @@ class TestLocalEvalLogs:
  pytest.skip("Local eval logs directory not found")

  logs = list(LOCAL_EVAL_LOGS_DIR.glob("*.eval"))
- # Just verify we can list them
- assert isinstance(logs, list)
+ # Verify we found eval logs (if E2E tests have run, there should be some)
+ assert len(logs) > 0, f"No eval logs found in {LOCAL_EVAL_LOGS_DIR}"


  # =============================================================================
@@ -103,11 +112,12 @@ class TestVPSEvalLogs:
  assert result == "exists", "VPS eval logs directory not found"

  def test_vps_eval_log_count(self):
- """Should be able to count eval logs on VPS."""
+ """Should have eval logs on VPS if sessions have completed."""
  result = ssh_command(f"find {VPS_EVAL_LOGS_DIR} -name '*.eval.gz' 2>/dev/null | wc -l")
- count = int(result.strip()) if result.strip().isdigit() else 0
- # Just verify we can count them
- assert count >= 0
+ assert result.strip().isdigit(), f"Invalid count result: {result}"
+ count = int(result.strip())
+ # If E2E tests have run, there should be eval logs
+ assert count > 0, f"No eval logs found on VPS in {VPS_EVAL_LOGS_DIR}"

  def test_completed_sessions_have_eval_log_path(self):
  """Completed sessions should have eval_log_path recorded."""
@@ -128,9 +138,14 @@ class TestVPSEvalLogs:
  """)

  # All completed sessions should have eval log paths
- assert int(with_path) == int(
- count
- ), f"Only {with_path}/{count} completed sessions have eval_log_path"
+ # Handle empty string from SQL query
+ with_path_count = int(with_path) if with_path else 0
+ total_count = int(count) if count else 0
+
+ if total_count == 0:
+ pytest.skip("No completed sessions to check")
+
+ assert with_path_count == total_count, f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"

  def test_eval_log_files_exist_on_vps(self):
  """Eval log files referenced in DB should exist on VPS."""
@@ -147,8 +162,9 @@ class TestVPSEvalLogs:

  for path in paths.split("\n"):
  if path:
- exists = ssh_command(f"test -f {path} && echo exists")
- assert exists == "exists", f"Eval log not found: {path}"
+ host_path = db_path_to_host_path(path)
+ exists = ssh_command(f"test -f {host_path} && echo exists")
+ assert exists == "exists", f"Eval log not found: {host_path} (DB path: {path})"


  # =============================================================================
@@ -175,32 +191,34 @@ class TestEvalLogFormat:

  def test_eval_log_can_be_decompressed(self):
  """Eval logs should be valid gzip files."""
- path = ssh_query("""
+ db_path = ssh_query("""
  SELECT eval_log_path FROM sessions
  WHERE status = 'submitted'
  AND eval_log_path IS NOT NULL
  LIMIT 1
  """)

- if not path:
+ if not db_path:
  pytest.skip("No eval logs to test")

+ path = db_path_to_host_path(db_path)
  # Try to decompress
  result = ssh_command(f"gunzip -t {path} 2>&1 && echo ok")
  assert "ok" in result, f"Eval log not valid gzip: {result}"

  def test_eval_log_contains_expected_structure(self):
  """Eval logs should contain expected Inspect AI structure."""
- path = ssh_query("""
+ db_path = ssh_query("""
  SELECT eval_log_path FROM sessions
  WHERE status = 'submitted'
  AND eval_log_path IS NOT NULL
  LIMIT 1
  """)

- if not path:
+ if not db_path:
  pytest.skip("No eval logs to test")

+ path = db_path_to_host_path(db_path)
  # List contents of the gzipped eval (it's actually a zip inside gzip)
  # First copy to temp, decompress, check structure
  result = ssh_command(f"""
@@ -225,40 +243,43 @@ class TestEvalLogUpload:
  """Test eval log upload functionality."""

  def test_upload_event_recorded(self):
- """Upload events should be recorded in session_events."""
+ """Upload events should be recorded in session_events for sessions with eval logs."""
+ # Only check sessions that have eval_log_path (proves upload succeeded)
  session_id = ssh_query(f"""
  SELECT id FROM sessions
  WHERE user_id = '{get_test_user_id()}'
  AND status = 'submitted'
+ AND eval_log_path IS NOT NULL
  LIMIT 1
  """)

  if not session_id:
- pytest.skip("No completed sessions")
+ pytest.skip("No completed sessions with eval logs")

  events = ssh_query(f"""
  SELECT event_type FROM session_events
  WHERE session_id = '{session_id}'
  """)

- # Should have upload-related events
+ # Should have upload-related events for sessions with eval logs
  event_list = events.split("\n") if events else []
  has_upload = any("upload" in e.lower() for e in event_list)
- # Note: upload events might not always exist
- assert isinstance(has_upload, bool)
+
+ assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"

  def test_eval_log_size_reasonable(self):
  """Eval logs should be reasonably sized (not empty, not huge)."""
- path = ssh_query("""
+ db_path = ssh_query("""
  SELECT eval_log_path FROM sessions
  WHERE status = 'submitted'
  AND eval_log_path IS NOT NULL
  LIMIT 1
  """)

- if not path:
+ if not db_path:
  pytest.skip("No eval logs to test")

+ path = db_path_to_host_path(db_path)
  size = ssh_command(f"stat -c%s {path} 2>/dev/null || stat -f%z {path}")

  if size.isdigit():
tests/e2e/test_infrastructure.py
@@ -25,31 +25,8 @@ from tests.e2e.conftest import (
  ssh_query,
  )

- # Expected task assignments (4 per benchmark)
- # First 2 for pytest API tests, last 2 for interactive tests
- EXPECTED_TASKS = {
- "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
- "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
- "intercode-ctf": [
- "intercode-ctf_general_skills/task_4",
- "intercode-ctf_reverse/task_1",
- "intercode-ctf_reverse/task_3",
- "intercode-ctf_reverse/task_85",
- ],
- "nyuctf": [
- "2013f-cry-stfu",
- "2016q-rev-rock",
- "2023q-rev-baby_s_first",
- "2019q-cry-super_curve",
- ],
- "cybashbench": [
- "cybashbench_pwn/nl2bash_003",
- "cybashbench_reverse/nl2bash_004",
- "cybashbench_web/nl2bash_001",
- "cybashbench_forensics/nl2bash_005",
- ],
- "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
- }
+ # Import shared constants from conftest
+ from tests.e2e.conftest import EXPECTED_TASKS, EXPECTED_ASSIGNMENT_COUNT


  # =============================================================================
@@ -133,11 +110,13 @@ class TestAssignments:
  """Test that task assignments are correctly set up."""

  def test_correct_number_of_assignments(self):
- """Test user should have exactly 12 assignments."""
+ """Test user should have expected number of assignments."""
  count = ssh_query(
  f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
  )
- assert int(count) == 24
+ assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+ f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+ )

  @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
  def test_benchmark_tasks_assigned(self, benchmark, tasks):
@@ -226,8 +205,9 @@ class TestAPIEndpoints:
  )
  assert response.status_code == 200
  assignments = response.json()
- # User may have different number of assignments
- assert isinstance(assignments, list)
+ # Test user should have assignments from E2E setup
+ assert isinstance(assignments, list), "Expected list of assignments"
+ assert len(assignments) > 0, "Test user should have at least one assignment"

  def test_assignment_has_task_info(self, api_headers):
  """Assignments should include task information."""
tests/e2e/test_runtime_imports.py
@@ -149,8 +149,10 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')
  pytest.fail(f"Import failed in container: {result.stderr}")

  assert "Loaded" in result.stdout
- # Should have at least 6 benchmarks
- assert "6 benchmarks" in result.stdout or "7 benchmarks" in result.stdout
+ # Should have exactly 7 benchmarks
+ assert "7 benchmarks" in result.stdout, (
+ f"Expected 7 benchmarks, got: {result.stdout}"
+ )

  def test_backend_can_import_adapters(self):
  """Backend should be able to instantiate adapters."""
@@ -176,9 +178,11 @@ for name, cls in HUMAN_REGISTRY.items():
  if "FAIL" in result.stdout:
  pytest.fail(f"Adapter instantiation failed: {result.stdout}")

- # Should have OK for all benchmarks
+ # All benchmarks should show OK - STRICT check
  for benchmark in BENCHMARKS:
- assert f"{benchmark}: OK" in result.stdout or benchmark not in result.stdout
+ assert f"{benchmark}: OK" in result.stdout, (
+ f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
+ )


  class TestLocalImports:
tests/e2e/test_session_lifecycle.py
@@ -164,17 +164,26 @@ class TestSessionCompletion:

  def test_completed_session_has_score(self):
  """Completed sessions should have a score."""
+ # Count total submitted sessions
+ total_submitted = ssh_query(f"""
+ SELECT COUNT(*) FROM sessions
+ WHERE user_id = '{get_test_user_id()}'
+ AND status = 'submitted'
+ """)
+ total = int(total_submitted) if total_submitted else 0
+ if total == 0:
+ pytest.skip("No submitted sessions to verify")
+
+ # Count sessions without score
  sessions_without_score = ssh_query(f"""
  SELECT COUNT(*) FROM sessions
  WHERE user_id = '{get_test_user_id()}'
  AND status = 'submitted'
  AND score IS NULL
  """)
- # Note: score can legitimately be NULL for some benchmarks
- # This test documents expected behavior
- count = int(sessions_without_score)
- # We just want to verify we can query this
- assert count >= 0
+ count = int(sessions_without_score) if sessions_without_score else 0
+ # Most submitted sessions should have scores (some benchmarks may not score)
+ assert count < total, f"All {total} sessions missing scores"

  def test_completed_session_has_answer(self):
  """Completed sessions should have an answer."""
@@ -208,14 +217,16 @@ class TestSessionState:
  """Test session state verification (read-only)."""

  def test_abandoned_sessions_count(self):
- """Verify we can count abandoned sessions."""
+ """Verify abandoned sessions exist and are queryable."""
  abandoned_count = ssh_query(f"""
  SELECT COUNT(*) FROM sessions
  WHERE user_id = '{get_test_user_id()}'
  AND status = 'abandoned'
  """)
- # Just verify we can query abandoned sessions
- assert int(abandoned_count) >= 0
+ count = int(abandoned_count) if abandoned_count else 0
+ # Verify the query returned a valid number (not empty/error)
+ assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
+ # Note: count can legitimately be 0 if no sessions were abandoned

  def test_no_stuck_sessions_older_than_24h(self):
  """No in_progress sessions should be older than 24 hours."""
@@ -387,8 +398,9 @@ class TestSessionCancellation:
  WHERE user_id = '{get_test_user_id()}'
  AND status = 'cancelled'
  """)
- # Just verify we can query cancelled sessions
- assert int(cancelled) >= 0
+ # Verify query returned valid result
+ assert cancelled.strip().isdigit(), f"Query returned invalid value: {cancelled}"
+ # Note: count can legitimately be 0 if no sessions were cancelled

  def test_no_orphaned_in_progress_after_cancel(self):
  """Assignments should not be in_progress if session is cancelled."""
uv.lock
@@ -625,7 +625,7 @@ wheels = [

  [[package]]
  name = "hte-cli"
- version = "0.2.18"
+ version = "0.2.21"
  source = { editable = "." }
  dependencies = [
  { name = "click" },