hte-cli 0.2.19__tar.gz → 0.2.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {hte_cli-0.2.19 → hte_cli-0.2.21}/PKG-INFO +1 -1
  2. {hte_cli-0.2.19 → hte_cli-0.2.21}/pyproject.toml +1 -1
  3. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/automated_runner.py +139 -67
  4. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/conftest.py +35 -0
  5. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/e2e_test.py +149 -23
  6. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_benchmark_flows.py +12 -3
  7. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_eval_logs.py +10 -9
  8. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_infrastructure.py +9 -29
  9. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_runtime_imports.py +8 -4
  10. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_session_lifecycle.py +22 -10
  11. {hte_cli-0.2.19 → hte_cli-0.2.21}/uv.lock +1 -1
  12. {hte_cli-0.2.19 → hte_cli-0.2.21}/.gitignore +0 -0
  13. {hte_cli-0.2.19 → hte_cli-0.2.21}/README.md +0 -0
  14. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/__init__.py +0 -0
  15. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/__main__.py +0 -0
  16. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/api_client.py +0 -0
  17. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/cli.py +0 -0
  18. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/config.py +0 -0
  19. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/errors.py +0 -0
  20. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/events.py +0 -0
  21. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/image_utils.py +0 -0
  22. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/runner.py +0 -0
  23. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/scorers.py +0 -0
  24. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/version_check.py +0 -0
  25. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/__init__.py +0 -0
  26. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/__init__.py +0 -0
  27. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/verify_docker_deps.py +0 -0
  28. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/__init__.py +0 -0
  29. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/conftest.py +0 -0
  30. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_image_utils.py +0 -0
  31. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_scorers.py +0 -0
{hte_cli-0.2.19 → hte_cli-0.2.21}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hte-cli
-Version: 0.2.19
+Version: 0.2.21
 Summary: Human Time-to-Completion Evaluation CLI
 Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
 Author: Lyptus Research
{hte_cli-0.2.19 → hte_cli-0.2.21}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hte-cli"
-version = "0.2.19"
+version = "0.2.21"
 description = "Human Time-to-Completion Evaluation CLI"
 readme = "README.md"
 requires-python = ">=3.11"
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/automated_runner.py
@@ -55,8 +55,8 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
 # Environment verification commands per benchmark
 # Based on docs/benchmark-environments.md
 # Format: (command, expected_pattern, description)
-# expected_pattern=None means just check command succeeds (exit 0)
-# These tests are STRICT - they verify the exact environment specified in docs
+# STRICT: All tests must pass - no optional/informational checks
+# These tests verify the exact environment specified in docs
 ENVIRONMENT_CHECKS = {
     "intercode-ctf": [
         # Working directory: /ctf/{task_num} (e.g., /ctf/3, /ctf/85)
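For context, each ENVIRONMENT_CHECKS entry is evaluated inside the task container much like the runner loop shown later in this diff: the command runs via docker exec, and the check passes when the expected pattern appears in stdout (per the comment removed above, a None pattern has meant only "command exits 0"). A minimal sketch under those assumptions; the helper name and container handling here are illustrative, not part of the package:

    import subprocess

    def run_one_check(container_name: str, cmd: str, expected: str | None) -> bool:
        """Illustrative evaluation of one (command, expected_pattern, description) tuple."""
        result = subprocess.run(
            ["docker", "exec", container_name, "sh", "-c", cmd],
            capture_output=True,
            text=True,
            timeout=30,  # matches the timeout bump made elsewhere in this release
        )
        if expected is None:
            return result.returncode == 0  # pattern-less checks only need a clean exit
        return expected in result.stdout.strip()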
@@ -69,8 +69,8 @@ ENVIRONMENT_CHECKS = {
         ("which netcat", "/usr/bin/netcat", "netcat installed"),
         ("which tcpdump", "/usr/bin/tcpdump", "tcpdump installed"),
         ("which curl", "/usr/bin/curl", "curl installed"),
-        # Forensics tools
-        ("which mmls", None, "sleuthkit installed (mmls)"),
+        # Forensics tools - STRICT: must exist
+        ("which mmls", "/usr/bin/mmls", "mmls (sleuthkit) at /usr/bin/mmls"),
         # Python packages - must import successfully
         ("python3 -c 'from pwn import *; print(\"pwntools\")'", "pwntools", "pwntools importable"),
         ("python3 -c 'from scapy.all import *; print(\"scapy\")'", "scapy", "scapy importable"),
@@ -90,16 +90,17 @@ ENVIRONMENT_CHECKS = {
         ("pwd", "/home/ctfplayer", "Working directory is /home/ctfplayer"),
         # User: ctfplayer (NOT root)
         ("whoami", "ctfplayer", "Running as ctfplayer user"),
-        ("id -u", None, "User ID check"),  # Should be non-zero
+        # User ID must be 1000 (standard non-root)
+        ("id -u", "1000", "User ID is 1000"),
         # Passwordless sudo must work
         ("sudo -n whoami", "root", "Passwordless sudo to root"),
         # OS: Ubuntu 22.04 specifically
         ("cat /etc/os-release | grep VERSION_ID", "22.04", "Ubuntu 22.04 base"),
-        # Core tools at expected paths
+        # Core tools at expected paths - STRICT
         ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
-        ("which radare2", "/usr/bin/radare2", "radare2 installed"),
-        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
-        ("which jadx", None, "jadx installed"),
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+        ("which jadx", "/usr/local/bin/jadx", "jadx at /usr/local/bin/jadx"),
         # Python venv must exist at expected location
         (
             "test -f /home/ctfplayer/.ctfenv/bin/python3 && echo 'venv'",
@@ -112,40 +113,42 @@ ENVIRONMENT_CHECKS = {
         ("python3 -c 'import z3; print(\"z3\")'", "z3", "z3-solver importable"),
         # 32-bit support (for CTF binaries)
         ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
+        # Task files location - STRICT
+        ("test -d /home/ctfplayer/ctf_files && echo 'EXISTS'", "EXISTS", "ctf_files directory exists"),
     ],
     "cybench": [
-        # Working directory: /root (Kali default)
-        ("pwd", "/root", "Working directory is /root"),
+        # Working directory: /app (official cybench/cybench:latest image)
+        ("pwd", "/app", "Working directory is /app"),
         # OS: Kali Linux (must contain 'Kali' in name)
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # Image detection tools (see docs/cybench-image-investigation.md):
-        # - aisiuk/evals-cybench-agent-sandbox: NO gdb, NO ghidra, YES radare2
-        # - cybench/cybench:latest: YES gdb, NO ghidra, YES radare2
-        # - cybench/kali-linux-large:latest: YES gdb, YES ghidra, YES radare2
-        # All of these pass - we just want to know which image
-        ("which gdb || echo 'NOT_FOUND'", None, "[IMAGE] gdb"),
-        ("which ghidra || echo 'NOT_FOUND'", None, "[IMAGE] ghidra"),
-        ("which radare2 || echo 'NOT_FOUND'", None, "[IMAGE] radare2"),
-        ("which binwalk || echo 'NOT_FOUND'", None, "[IMAGE] binwalk"),
-        # Security tools
-        ("which nmap", "/usr/bin/nmap", "nmap installed"),
-        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
-        ("which msfconsole || echo 'NOT_FOUND'", None, "[IMAGE] msfconsole"),
-        ("which hydra || echo 'NOT_FOUND'", None, "[IMAGE] hydra"),
-        ("which john || echo 'NOT_FOUND'", None, "[IMAGE] john"),
-        # Docker-in-Docker capability (required for web challenges)
+        # Required tools - STRICT: must exist at expected paths
+        # Using cybench/cybench:latest which has all these tools
+        ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+        ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        ("which binwalk", "/usr/bin/binwalk", "binwalk at /usr/bin/binwalk"),
+        # Security tools - STRICT
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+        # Docker available (daemon may not be running in all test contexts)
         ("docker --version", "Docker version", "Docker available"),
-        ("docker ps 2>&1 | head -1", None, "Docker daemon accessible"),
-        # Python 3
-        ("which python3", None, "python3 installed"),
-        ("python3 --version", "Python 3", "Python 3.x"),
+        # NOTE: Docker daemon check (docker ps) removed - DinD requires privileged
+        # mode which isn't always available. Web challenges needing DinD will
+        # fail at runtime if daemon isn't accessible.
+        # Python via venv - STRICT
+        ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
+        (
+            "/venv/bin/python3 -c 'from pwn import *; print(\"pwntools\")'",
+            "pwntools",
+            "pwntools in /venv",
+        ),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
         # Task files visible - cybench tasks have nested directories (blog/, nginx/, etc.)
-        # Check that files exist in /root (not just empty directory)
-        ("ls -la /root/ | grep -E '^d|^-' | wc -l", None, "Items in /root"),
+        # Check that files exist in /root (not just empty directory) using test -gt 0
+        ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
         # For tasks with subdirs (chunky, etc.), verify structure is mounted
-        ("find /root -type f 2>/dev/null | head -5", None, "Files under /root (recursive)"),
+        ("test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0", None, "Files under /root (recursive)"),
     ],
     "cybergym": [
         # Working directory: /root
@@ -155,29 +158,27 @@ ENVIRONMENT_CHECKS = {
         # CyberGym-specific: submit.sh MUST exist for PoC submission
         ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present at /submit.sh"),
         ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
-        # Image detection (same as cybench - uses cybench/cybench:latest)
-        ("which gdb || echo 'NOT_FOUND'", None, "[IMAGE] gdb"),
-        ("which ghidra || echo 'NOT_FOUND'", None, "[IMAGE] ghidra"),
-        ("which radare2 || echo 'NOT_FOUND'", None, "[IMAGE] radare2"),
-        # Core security tools
-        ("which python3", None, "python3 installed"),
-        ("which nmap", "/usr/bin/nmap", "nmap installed"),
-        ("which curl", "/usr/bin/curl", "curl installed"),
+        # Required tools - STRICT (same image as cybench)
+        ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+        ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        # Core security tools - STRICT
+        ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
-        # Task files from setup.sh
-        ("ls /root/ 2>/dev/null | wc -l", None, "Files in /root"),
     ],
     "cvebench": [
         # OS: Kali Linux
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # Core security tools for CVE exploitation
-        ("which python3", None, "python3 installed"),
-        ("which nmap", "/usr/bin/nmap", "nmap installed"),
-        ("which curl", "/usr/bin/curl", "curl installed"),
-        ("which wget", "/usr/bin/wget", "wget installed"),
-        # Network access to target
-        ("which nc", None, "netcat installed"),
+        # Core security tools for CVE exploitation - STRICT
+        ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
+        ("which wget", "/usr/bin/wget", "wget at /usr/bin/wget"),
+        # Network access to target - STRICT
+        ("which nc", "/usr/bin/nc", "netcat at /usr/bin/nc"),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
     ],
@@ -191,6 +192,43 @@ ENVIRONMENT_CHECKS = {
     ],
 }
 
+# Task-specific file expectations
+# Maps benchmark -> task_id -> list of expected paths
+# Use "*" as task_id for checks that apply to all tasks in a benchmark
+TASK_FILE_CHECKS = {
+    "cybench": {
+        "chunky": ["/root/blog", "/root/nginx", "/root/rcache"],
+        "avatar": ["/root/chall.py"],
+        "lootstash": ["/root/stash"],
+        "primary_knowledge": [],  # No local files - network-based
+    },
+    "cybergym": {
+        "*": ["/submit.sh"],  # All cybergym tasks need submit.sh
+    },
+    "nyuctf": {
+        "*": ["/home/ctfplayer/ctf_files"],  # Standard location
+    },
+    "intercode-ctf": {
+        "*": ["/ctf"],  # Task files in /ctf directory
+    },
+}
+
+
+def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
+    """Generate file existence checks based on task expectations."""
+    checks = []
+    task_files = TASK_FILE_CHECKS.get(benchmark, {})
+
+    # Try task-specific first, then fallback to wildcard
+    expected = task_files.get(task_id) or task_files.get("*", [])
+
+    for path in expected:
+        checks.append(
+            (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
+        )
+
+    return checks
+
 # Commands to run for submission tests
 SUBMISSION_TESTS = {
     "intercode-ctf": {
@@ -503,7 +541,12 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
 
     # Get benchmark-specific checks from ENVIRONMENT_CHECKS
     env_checks = ENVIRONMENT_CHECKS.get(benchmark, [])
-    for check in env_checks:
+
+    # Add task-specific file checks
+    file_checks = get_file_checks_for_task(benchmark, task_id)
+    all_checks = env_checks + file_checks
+
+    for check in all_checks:
         # Unpack: (command, expected_pattern, description)
         if len(check) == 3:
             cmd, expected, desc = check
@@ -517,7 +560,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
                 ["docker", "exec", container_name, "sh", "-c", cmd],
                 capture_output=True,
                 text=True,
-                timeout=15,
+                timeout=30,  # Increased for slow imports (angr takes ~10s)
             )
             output = result.stdout.strip()
             stderr = result.stderr.strip()
@@ -564,13 +607,20 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
         time.sleep(2)
         docker_child.expect(prompt_patterns[:-1], timeout=30)
         output = strip_ansi(docker_child.before or "")
-        results.append(
-            TestResult(
-                "task score",
-                True,  # Just checking it runs
-                output[:200],
+
+        expected_score = sub_tests.get("score_expect")
+        if expected_score:
+            passed = expected_score.lower() in output.lower()
+            details = output[:200] if passed else f"Expected '{expected_score}' in output: {output[:100]}..."
+            results.append(TestResult("task score", passed, details))
+        else:
+            results.append(
+                TestResult(
+                    "task score",
+                    True,  # Just checking it runs
+                    output[:200],
+                )
             )
-        )
 
         # Submit answer
         console.print("Submitting test answer...")
@@ -605,14 +655,15 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
             timeout=60,
         )
         # EOF (idx=4) is expected - container exits after task submit
-        if idx <= 4:
+        # TIMEOUT (idx=3) is a failure
+        if idx != 3:
             results.append(
                 TestResult("Submission", True, "Answer submitted (container exited)")
             )
         else:
             results.append(
                 TestResult(
-                    "Submission", False, docker_child.before or "", "Submission may have failed"
+                    "Submission", False, docker_child.before or "", "Submission timed out waiting for result"
                 )
             )
     elif idx < 3:
@@ -624,7 +675,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     else:
         results.append(
             TestResult(
-                "Submission", False, docker_child.before or "", "Submission may have failed"
+                "Submission", False, docker_child.before or "", "Submission timed out at prompt"
             )
         )
 
@@ -671,10 +722,19 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     return results
 
 
-def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
+def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
     """Verify session and eval log artifacts were created."""
     results = []
 
+    # Get expected answer from SUBMISSION_TESTS
+    sub_tests = SUBMISSION_TESTS.get(benchmark, {})
+    submit_cmd = sub_tests.get("submit_cmd", "")
+    expected_answer = None
+    if 'task submit "' in submit_cmd:
+        expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
+    elif "task submit '" in submit_cmd:
+        expected_answer = submit_cmd.split("task submit '")[1].rstrip("'")
+
     # Check session in database
     session_info = ssh_query(f"""
         SELECT id, status, score, client_active_seconds, answer
@@ -699,11 +759,23 @@ def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
             "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
         )
     )
-    results.append(
-        TestResult(
-            "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+
+    # Verify answer
+    if expected_answer and benchmark != "cybergym":  # Cybergym submits file content, hard to verify here
+        results.append(
+            TestResult(
+                "Answer matches submission",
+                answer == expected_answer,
+                f"Expected: '{expected_answer}', Got: '{answer}'"
+            )
         )
-    )
+    else:
+        results.append(
+            TestResult(
+                "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+            )
+        )
+
     results.append(
         TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
     )
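The expected-answer extraction above is a plain split on the quoted task submit invocation; anything that matches neither quoting style leaves expected_answer as None and falls through to the looser "Answer recorded" check. A worked example with a placeholder flag (not a value from the package):

    submit_cmd = 'task submit "FLAG{not-a-real-flag}"'   # placeholder command
    expected_answer = None
    if 'task submit "' in submit_cmd:
        expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
    # expected_answer == "FLAG{not-a-real-flag}", which is then compared
    # against the answer column stored for the session.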
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/conftest.py
@@ -22,6 +22,41 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
 TEST_EMAIL = "e2e-test@lyptus.dev"
 CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
 
+# Expected task assignments per benchmark (4 per benchmark)
+# First 2 for pytest API tests, last 2 for interactive/automated tests
+EXPECTED_TASKS = {
+    "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
+    "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
+    "intercode-ctf": [
+        "intercode-ctf_general_skills/task_4",
+        "intercode-ctf_reverse/task_1",
+        "intercode-ctf_reverse/task_3",
+        "intercode-ctf_reverse/task_85",
+    ],
+    "nyuctf": [
+        "2013f-cry-stfu",
+        "2016q-rev-rock",
+        "2023q-rev-baby_s_first",
+        "2019q-cry-super_curve",
+    ],
+    "cybashbench": [
+        "cybashbench_pwn/nl2bash_003",
+        "cybashbench_reverse/nl2bash_004",
+        "cybashbench_web/nl2bash_001",
+        "cybashbench_forensics/nl2bash_005",
+    ],
+    "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+    "nl2bash": [
+        "nl2bash_complex/task_8581",
+        "nl2bash_complex/task_713",
+        "nl2bash_complex/task_712",
+        "nl2bash_complex/task_8796",
+    ],
+}
+
+# Total expected assignments (calculated from EXPECTED_TASKS)
+EXPECTED_ASSIGNMENT_COUNT = sum(len(tasks) for tasks in EXPECTED_TASKS.values())
+
 
 def ssh_query(query: str) -> str:
     """Run a sqlite3 query on the VPS (READ-ONLY)."""
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/e2e_test.py
@@ -61,6 +61,12 @@ BENCHMARK_TASKS = {
         "cybashbench_forensics/nl2bash_005",
     ],
     "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+    "nl2bash": [
+        "nl2bash_complex/task_8581",
+        "nl2bash_complex/task_713",
+        "nl2bash_complex/task_712",
+        "nl2bash_complex/task_8796",
+    ],
 }
 
 # Instructions for each benchmark type
@@ -688,38 +694,158 @@ def cleanup():
     help="Admin password for API access",
 )
 @click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompts")
-def full(admin_password: str, yes: bool):
-    """Run complete E2E test suite (setup, run all, verify, cleanup)."""
-    console.print("\n[bold]Full E2E Test Suite[/bold]\n")
-    console.print("[yellow]This will run all benchmarks interactively.[/yellow]")
-    console.print("You'll need to interact with each task container.\n")
-
-    if not yes and not click.confirm("Continue?"):
+@click.option("--skip-setup", is_flag=True, help="Skip setup if already done")
+@click.option("--cleanup-after", is_flag=True, help="Run cleanup after tests")
+def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
+    """Run complete E2E test suite in 3 phases.
+
+    Phase 1: Infrastructure tests (pytest, fast, no containers)
+    Phase 2: Automated benchmark E2E tests (pexpect, creates completed sessions)
+    Phase 3: Session verification tests (pytest, validates completed sessions)
+
+    This is fully automated - no user interaction required.
+    """
+    console.print(Panel("[bold]Full E2E Test Suite - 3 Phases[/bold]", style="cyan"))
+    console.print("""
+    [dim]Phase 1:[/dim] Infrastructure tests (pytest)
+    [dim]Phase 2:[/dim] Automated benchmark E2E tests (pexpect)
+    [dim]Phase 3:[/dim] Session verification tests (pytest)
+    """)
+
+    if not yes and not click.confirm("Run full automated E2E suite?"):
         raise click.ClickException("Aborted")
 
-    # Setup
-    ctx = click.get_current_context()
-    ctx.invoke(setup, admin_password=admin_password, yes=yes)
+    results = {"phase1": None, "phase2": {}, "phase3": None}
+    tests_dir = Path(__file__).parent
+
+    # Setup (unless skipped)
+    if not skip_setup:
+        console.print("\n" + "=" * 60)
+        console.print("[bold cyan]SETUP: Creating test user and assignments[/bold cyan]")
+        console.print("=" * 60)
+        ctx = click.get_current_context()
+        ctx.invoke(setup, admin_password=admin_password, yes=True)
+
+    # Phase 1: Infrastructure tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 1: Infrastructure Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running pytest on infrastructure, imports, benchmark flows...[/dim]\n")
+
+    phase1_result = subprocess.run(
+        [
+            "uv", "run", "pytest",
+            str(tests_dir / "test_infrastructure.py"),
+            str(tests_dir / "test_runtime_imports.py"),
+            str(tests_dir / "test_benchmark_flows.py"),
+            "-v", "--tb=short",
+        ],
+        cwd=tests_dir.parent.parent,
+    )
+    results["phase1"] = phase1_result.returncode == 0
+
+    if not results["phase1"]:
+        console.print("\n[red bold]Phase 1 FAILED - stopping[/red bold]")
+        _print_full_summary(results)
+        raise SystemExit(1)
 
-    # Run each benchmark
-    for benchmark in BENCHMARK_TASKS.keys():
-        console.print(f"\n{'=' * 50}")
-        console.print(f"[bold]Benchmark: {benchmark}[/bold]")
+    console.print("\n[green]Phase 1 PASSED[/green]")
+
+    # Phase 2: Automated benchmark E2E tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 2: Automated Benchmark E2E Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running automated tests for each benchmark via pexpect...[/dim]\n")
 
-        for i in range(2):
-            if click.confirm(f"\nRun task {i+1}/2 for {benchmark}?"):
-                ctx.invoke(run, benchmark=benchmark, task_index=i)
+    from automated_runner import run_benchmark_test
 
-    # Verify
-    console.print(f"\n{'=' * 50}")
-    ctx.invoke(verify, admin_password=admin_password)
+    for benchmark in BENCHMARK_TASKS.keys():
+        console.print(f"\n[bold]--- {benchmark} ---[/bold]")
+        try:
+            # Run task index 2 (third task, reserved for automated E2E)
+            success = run_benchmark_test(benchmark, task_index=2)
+            results["phase2"][benchmark] = success
+            if success:
+                console.print(f"[green]{benchmark}: PASSED[/green]")
+            else:
+                console.print(f"[red]{benchmark}: FAILED[/red]")
+        except Exception as e:
+            console.print(f"[red]{benchmark}: ERROR - {e}[/red]")
+            results["phase2"][benchmark] = False
+
+    phase2_passed = all(results["phase2"].values())
+    if not phase2_passed:
+        console.print("\n[yellow]Phase 2 had failures - continuing to Phase 3[/yellow]")
+
+    # Phase 3: Session verification tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 3: Session Verification Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running pytest on session lifecycle and eval logs...[/dim]\n")
+
+    phase3_result = subprocess.run(
+        [
+            "uv", "run", "pytest",
+            str(tests_dir / "test_session_lifecycle.py"),
+            str(tests_dir / "test_eval_logs.py"),
+            "-v", "--tb=short",
+        ],
+        cwd=tests_dir.parent.parent,
+    )
+    results["phase3"] = phase3_result.returncode == 0
+
+    # Summary
+    _print_full_summary(results)
 
     # Cleanup
-    console.print(f"\n{'=' * 50}")
-    if click.confirm("Run cleanup?"):
+    if cleanup_after:
+        console.print("\n" + "=" * 60)
+        console.print("[bold cyan]CLEANUP[/bold cyan]")
+        ctx = click.get_current_context()
         ctx.invoke(cleanup)
 
-    console.print("\n[bold green]Full E2E test complete![/bold green]")
+    # Exit with appropriate code
+    all_passed = results["phase1"] and phase2_passed and results["phase3"]
+    if all_passed:
+        console.print("\n[bold green]All phases PASSED![/bold green]")
+    else:
+        console.print("\n[bold red]Some phases FAILED[/bold red]")
+        raise SystemExit(1)
+
+
+def _print_full_summary(results: dict):
+    """Print summary table of all phases."""
+    console.print("\n" + "=" * 60)
+    console.print("[bold]SUMMARY[/bold]")
+    console.print("=" * 60)
+
+    table = Table()
+    table.add_column("Phase", style="cyan")
+    table.add_column("Status")
+    table.add_column("Details")
+
+    # Phase 1
+    if results["phase1"] is not None:
+        status = "[green]PASSED[/green]" if results["phase1"] else "[red]FAILED[/red]"
+        table.add_row("Phase 1: Infrastructure", status, "pytest infra/imports/flows")
+
+    # Phase 2
+    if results["phase2"]:
+        passed = sum(1 for v in results["phase2"].values() if v)
+        total = len(results["phase2"])
+        status = "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+        details = ", ".join(
+            f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
+            for b, v in results["phase2"].items()
+        )
+        table.add_row("Phase 2: Benchmarks", status, details)
+
+    # Phase 3
+    if results["phase3"] is not None:
+        status = "[green]PASSED[/green]" if results["phase3"] else "[red]FAILED[/red]"
+        table.add_row("Phase 3: Verification", status, "pytest lifecycle/logs")
+
+    console.print(table)
 
 
 if __name__ == "__main__":
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_benchmark_flows.py
@@ -13,7 +13,14 @@ Run with: uv run pytest tests/e2e/test_benchmark_flows.py -v
 import pytest
 import requests
 
-from tests.e2e.conftest import BASE_URL, get_test_user_id, ssh_command, ssh_query
+from tests.e2e.conftest import (
+    BASE_URL,
+    EXPECTED_ASSIGNMENT_COUNT,
+    EXPECTED_TASKS,
+    get_test_user_id,
+    ssh_command,
+    ssh_query,
+)
 
 # Benchmark test configurations
 # First 2 tasks for pytest API tests, last 2 for interactive tests
@@ -367,12 +374,14 @@ class TestCrossBenchmark:
         assert int(count) > 0, f"No assignments for {benchmark}"
 
     def test_total_assignments_correct(self):
-        """Total assignments should be 24 (4 per benchmark)."""
+        """Total assignments should match expected count (4 per benchmark)."""
         count = ssh_query(f"""
             SELECT COUNT(*) FROM assignments
             WHERE user_id = '{get_test_user_id()}'
         """)
-        assert int(count) == 24
+        assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+            f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+        )
 
 
 # =============================================================================
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_eval_logs.py
@@ -85,8 +85,8 @@ class TestLocalEvalLogs:
             pytest.skip("Local eval logs directory not found")
 
         logs = list(LOCAL_EVAL_LOGS_DIR.glob("*.eval"))
-        # Just verify we can list them
-        assert isinstance(logs, list)
+        # Verify we found eval logs (if E2E tests have run, there should be some)
+        assert len(logs) > 0, f"No eval logs found in {LOCAL_EVAL_LOGS_DIR}"
 
 
 # =============================================================================
@@ -103,11 +103,12 @@ class TestVPSEvalLogs:
         assert result == "exists", "VPS eval logs directory not found"
 
     def test_vps_eval_log_count(self):
-        """Should be able to count eval logs on VPS."""
+        """Should have eval logs on VPS if sessions have completed."""
         result = ssh_command(f"find {VPS_EVAL_LOGS_DIR} -name '*.eval.gz' 2>/dev/null | wc -l")
-        count = int(result.strip()) if result.strip().isdigit() else 0
-        # Just verify we can count them
-        assert count >= 0
+        assert result.strip().isdigit(), f"Invalid count result: {result}"
+        count = int(result.strip())
+        # If E2E tests have run, there should be eval logs
+        assert count > 0, f"No eval logs found on VPS in {VPS_EVAL_LOGS_DIR}"
 
     def test_completed_sessions_have_eval_log_path(self):
         """Completed sessions should have eval_log_path recorded."""
@@ -241,11 +242,11 @@ class TestEvalLogUpload:
             WHERE session_id = '{session_id}'
         """)
 
-        # Should have upload-related events
+        # Should have upload-related events for completed sessions
         event_list = events.split("\n") if events else []
         has_upload = any("upload" in e.lower() for e in event_list)
-        # Note: upload events might not always exist
-        assert isinstance(has_upload, bool)
+        # Completed sessions should have upload events
+        assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"
 
     def test_eval_log_size_reasonable(self):
         """Eval logs should be reasonably sized (not empty, not huge)."""
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_infrastructure.py
@@ -25,31 +25,8 @@ from tests.e2e.conftest import (
     ssh_query,
 )
 
-# Expected task assignments (4 per benchmark)
-# First 2 for pytest API tests, last 2 for interactive tests
-EXPECTED_TASKS = {
-    "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
-    "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
-    "intercode-ctf": [
-        "intercode-ctf_general_skills/task_4",
-        "intercode-ctf_reverse/task_1",
-        "intercode-ctf_reverse/task_3",
-        "intercode-ctf_reverse/task_85",
-    ],
-    "nyuctf": [
-        "2013f-cry-stfu",
-        "2016q-rev-rock",
-        "2023q-rev-baby_s_first",
-        "2019q-cry-super_curve",
-    ],
-    "cybashbench": [
-        "cybashbench_pwn/nl2bash_003",
-        "cybashbench_reverse/nl2bash_004",
-        "cybashbench_web/nl2bash_001",
-        "cybashbench_forensics/nl2bash_005",
-    ],
-    "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
-}
+# Import shared constants from conftest
+from tests.e2e.conftest import EXPECTED_TASKS, EXPECTED_ASSIGNMENT_COUNT
 
 
 # =============================================================================
@@ -133,11 +110,13 @@ class TestAssignments:
     """Test that task assignments are correctly set up."""
 
     def test_correct_number_of_assignments(self):
-        """Test user should have exactly 12 assignments."""
+        """Test user should have expected number of assignments."""
        count = ssh_query(
            f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
        )
-        assert int(count) == 24
+        assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+            f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+        )
 
     @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
     def test_benchmark_tasks_assigned(self, benchmark, tasks):
@@ -226,8 +205,9 @@ class TestAPIEndpoints:
         )
         assert response.status_code == 200
         assignments = response.json()
-        # User may have different number of assignments
-        assert isinstance(assignments, list)
+        # Test user should have assignments from E2E setup
+        assert isinstance(assignments, list), "Expected list of assignments"
+        assert len(assignments) > 0, "Test user should have at least one assignment"
 
     def test_assignment_has_task_info(self, api_headers):
         """Assignments should include task information."""
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_runtime_imports.py
@@ -149,8 +149,10 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')
             pytest.fail(f"Import failed in container: {result.stderr}")
 
         assert "Loaded" in result.stdout
-        # Should have at least 6 benchmarks
-        assert "6 benchmarks" in result.stdout or "7 benchmarks" in result.stdout
+        # Should have exactly 7 benchmarks
+        assert "7 benchmarks" in result.stdout, (
+            f"Expected 7 benchmarks, got: {result.stdout}"
+        )
 
     def test_backend_can_import_adapters(self):
         """Backend should be able to instantiate adapters."""
@@ -176,9 +178,11 @@ for name, cls in HUMAN_REGISTRY.items():
         if "FAIL" in result.stdout:
             pytest.fail(f"Adapter instantiation failed: {result.stdout}")
 
-        # Should have OK for all benchmarks
+        # All benchmarks should show OK - STRICT check
         for benchmark in BENCHMARKS:
-            assert f"{benchmark}: OK" in result.stdout or benchmark not in result.stdout
+            assert f"{benchmark}: OK" in result.stdout, (
+                f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
+            )
 
 
 class TestLocalImports:
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_session_lifecycle.py
@@ -164,17 +164,26 @@ class TestSessionCompletion:
 
     def test_completed_session_has_score(self):
         """Completed sessions should have a score."""
+        # Count total submitted sessions
+        total_submitted = ssh_query(f"""
+            SELECT COUNT(*) FROM sessions
+            WHERE user_id = '{get_test_user_id()}'
+            AND status = 'submitted'
+        """)
+        total = int(total_submitted) if total_submitted else 0
+        if total == 0:
+            pytest.skip("No submitted sessions to verify")
+
+        # Count sessions without score
         sessions_without_score = ssh_query(f"""
             SELECT COUNT(*) FROM sessions
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'submitted'
             AND score IS NULL
         """)
-        # Note: score can legitimately be NULL for some benchmarks
-        # This test documents expected behavior
-        count = int(sessions_without_score)
-        # We just want to verify we can query this
-        assert count >= 0
+        count = int(sessions_without_score) if sessions_without_score else 0
+        # Most submitted sessions should have scores (some benchmarks may not score)
+        assert count < total, f"All {total} sessions missing scores"
 
     def test_completed_session_has_answer(self):
         """Completed sessions should have an answer."""
@@ -208,14 +217,16 @@ class TestSessionState:
     """Test session state verification (read-only)."""
 
     def test_abandoned_sessions_count(self):
-        """Verify we can count abandoned sessions."""
+        """Verify abandoned sessions exist and are queryable."""
         abandoned_count = ssh_query(f"""
             SELECT COUNT(*) FROM sessions
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'abandoned'
         """)
-        # Just verify we can query abandoned sessions
-        assert int(abandoned_count) >= 0
+        count = int(abandoned_count) if abandoned_count else 0
+        # Verify the query returned a valid number (not empty/error)
+        assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
+        # Note: count can legitimately be 0 if no sessions were abandoned
 
     def test_no_stuck_sessions_older_than_24h(self):
         """No in_progress sessions should be older than 24 hours."""
@@ -387,8 +398,9 @@ class TestSessionCancellation:
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'cancelled'
         """)
-        # Just verify we can query cancelled sessions
-        assert int(cancelled) >= 0
+        # Verify query returned valid result
+        assert cancelled.strip().isdigit(), f"Query returned invalid value: {cancelled}"
+        # Note: count can legitimately be 0 if no sessions were cancelled
 
     def test_no_orphaned_in_progress_after_cancel(self):
         """Assignments should not be in_progress if session is cancelled."""
{hte_cli-0.2.19 → hte_cli-0.2.21}/uv.lock
@@ -625,7 +625,7 @@ wheels = [
 
 [[package]]
 name = "hte-cli"
-version = "0.2.18"
+version = "0.2.20"
 source = { editable = "." }
 dependencies = [
     { name = "click" },