hte-cli 0.2.19__tar.gz → 0.2.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {hte_cli-0.2.19 → hte_cli-0.2.21}/PKG-INFO +1 -1
  2. {hte_cli-0.2.19 → hte_cli-0.2.21}/pyproject.toml +1 -1
  3. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/automated_runner.py +139 -67
  4. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/conftest.py +35 -0
  5. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/e2e_test.py +149 -23
  6. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_benchmark_flows.py +12 -3
  7. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_eval_logs.py +10 -9
  8. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_infrastructure.py +9 -29
  9. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_runtime_imports.py +8 -4
  10. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_session_lifecycle.py +22 -10
  11. {hte_cli-0.2.19 → hte_cli-0.2.21}/uv.lock +1 -1
  12. {hte_cli-0.2.19 → hte_cli-0.2.21}/.gitignore +0 -0
  13. {hte_cli-0.2.19 → hte_cli-0.2.21}/README.md +0 -0
  14. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/__init__.py +0 -0
  15. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/__main__.py +0 -0
  16. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/api_client.py +0 -0
  17. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/cli.py +0 -0
  18. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/config.py +0 -0
  19. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/errors.py +0 -0
  20. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/events.py +0 -0
  21. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/image_utils.py +0 -0
  22. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/runner.py +0 -0
  23. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/scorers.py +0 -0
  24. {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/version_check.py +0 -0
  25. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/__init__.py +0 -0
  26. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/__init__.py +0 -0
  27. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/verify_docker_deps.py +0 -0
  28. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/__init__.py +0 -0
  29. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/conftest.py +0 -0
  30. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_image_utils.py +0 -0
  31. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_scorers.py +0 -0
{hte_cli-0.2.19 → hte_cli-0.2.21}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hte-cli
-Version: 0.2.19
+Version: 0.2.21
 Summary: Human Time-to-Completion Evaluation CLI
 Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
 Author: Lyptus Research
{hte_cli-0.2.19 → hte_cli-0.2.21}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hte-cli"
-version = "0.2.19"
+version = "0.2.21"
 description = "Human Time-to-Completion Evaluation CLI"
 readme = "README.md"
 requires-python = ">=3.11"
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/automated_runner.py
@@ -55,8 +55,8 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
 # Environment verification commands per benchmark
 # Based on docs/benchmark-environments.md
 # Format: (command, expected_pattern, description)
-# expected_pattern=None means just check command succeeds (exit 0)
-# These tests are STRICT - they verify the exact environment specified in docs
+# STRICT: All tests must pass - no optional/informational checks
+# These tests verify the exact environment specified in docs
 ENVIRONMENT_CHECKS = {
     "intercode-ctf": [
         # Working directory: /ctf/{task_num} (e.g., /ctf/3, /ctf/85)
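For context, each ENVIRONMENT_CHECKS entry is evaluated inside the task container much like the runner loop shown later in this diff: the command runs via docker exec, and the check passes when the expected pattern appears in stdout (per the comment removed above, a None pattern has meant only "command exits 0"). A minimal sketch under those assumptions; the helper name and container handling here are illustrative, not part of the package:

    import subprocess

    def run_one_check(container_name: str, cmd: str, expected: str | None) -> bool:
        """Illustrative evaluation of one (command, expected_pattern, description) tuple."""
        result = subprocess.run(
            ["docker", "exec", container_name, "sh", "-c", cmd],
            capture_output=True,
            text=True,
            timeout=30,  # matches the timeout bump made elsewhere in this release
        )
        if expected is None:
            return result.returncode == 0  # pattern-less checks only need a clean exit
        return expected in result.stdout.strip()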
@@ -69,8 +69,8 @@ ENVIRONMENT_CHECKS = {
         ("which netcat", "/usr/bin/netcat", "netcat installed"),
         ("which tcpdump", "/usr/bin/tcpdump", "tcpdump installed"),
         ("which curl", "/usr/bin/curl", "curl installed"),
-        # Forensics tools
-        ("which mmls", None, "sleuthkit installed (mmls)"),
+        # Forensics tools - STRICT: must exist
+        ("which mmls", "/usr/bin/mmls", "mmls (sleuthkit) at /usr/bin/mmls"),
         # Python packages - must import successfully
         ("python3 -c 'from pwn import *; print(\"pwntools\")'", "pwntools", "pwntools importable"),
         ("python3 -c 'from scapy.all import *; print(\"scapy\")'", "scapy", "scapy importable"),
@@ -90,16 +90,17 @@ ENVIRONMENT_CHECKS = {
         ("pwd", "/home/ctfplayer", "Working directory is /home/ctfplayer"),
         # User: ctfplayer (NOT root)
         ("whoami", "ctfplayer", "Running as ctfplayer user"),
-        ("id -u", None, "User ID check"),  # Should be non-zero
+        # User ID must be 1000 (standard non-root)
+        ("id -u", "1000", "User ID is 1000"),
         # Passwordless sudo must work
         ("sudo -n whoami", "root", "Passwordless sudo to root"),
         # OS: Ubuntu 22.04 specifically
         ("cat /etc/os-release | grep VERSION_ID", "22.04", "Ubuntu 22.04 base"),
-        # Core tools at expected paths
+        # Core tools at expected paths - STRICT
         ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
-        ("which radare2", "/usr/bin/radare2", "radare2 installed"),
-        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
-        ("which jadx", None, "jadx installed"),
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+        ("which jadx", "/usr/local/bin/jadx", "jadx at /usr/local/bin/jadx"),
         # Python venv must exist at expected location
         (
             "test -f /home/ctfplayer/.ctfenv/bin/python3 && echo 'venv'",
@@ -112,40 +113,42 @@ ENVIRONMENT_CHECKS = {
         ("python3 -c 'import z3; print(\"z3\")'", "z3", "z3-solver importable"),
         # 32-bit support (for CTF binaries)
         ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
+        # Task files location - STRICT
+        ("test -d /home/ctfplayer/ctf_files && echo 'EXISTS'", "EXISTS", "ctf_files directory exists"),
     ],
     "cybench": [
-        # Working directory: /root (Kali default)
-        ("pwd", "/root", "Working directory is /root"),
+        # Working directory: /app (official cybench/cybench:latest image)
+        ("pwd", "/app", "Working directory is /app"),
         # OS: Kali Linux (must contain 'Kali' in name)
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # Image detection tools (see docs/cybench-image-investigation.md):
-        # - aisiuk/evals-cybench-agent-sandbox: NO gdb, NO ghidra, YES radare2
-        # - cybench/cybench:latest: YES gdb, NO ghidra, YES radare2
-        # - cybench/kali-linux-large:latest: YES gdb, YES ghidra, YES radare2
-        # All of these pass - we just want to know which image
-        ("which gdb || echo 'NOT_FOUND'", None, "[IMAGE] gdb"),
-        ("which ghidra || echo 'NOT_FOUND'", None, "[IMAGE] ghidra"),
-        ("which radare2 || echo 'NOT_FOUND'", None, "[IMAGE] radare2"),
-        ("which binwalk || echo 'NOT_FOUND'", None, "[IMAGE] binwalk"),
-        # Security tools
-        ("which nmap", "/usr/bin/nmap", "nmap installed"),
-        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
-        ("which msfconsole || echo 'NOT_FOUND'", None, "[IMAGE] msfconsole"),
-        ("which hydra || echo 'NOT_FOUND'", None, "[IMAGE] hydra"),
-        ("which john || echo 'NOT_FOUND'", None, "[IMAGE] john"),
-        # Docker-in-Docker capability (required for web challenges)
+        # Required tools - STRICT: must exist at expected paths
+        # Using cybench/cybench:latest which has all these tools
+        ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+        ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        ("which binwalk", "/usr/bin/binwalk", "binwalk at /usr/bin/binwalk"),
+        # Security tools - STRICT
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+        # Docker available (daemon may not be running in all test contexts)
         ("docker --version", "Docker version", "Docker available"),
-        ("docker ps 2>&1 | head -1", None, "Docker daemon accessible"),
-        # Python 3
-        ("which python3", None, "python3 installed"),
-        ("python3 --version", "Python 3", "Python 3.x"),
+        # NOTE: Docker daemon check (docker ps) removed - DinD requires privileged
+        # mode which isn't always available. Web challenges needing DinD will
+        # fail at runtime if daemon isn't accessible.
+        # Python via venv - STRICT
+        ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
+        (
+            "/venv/bin/python3 -c 'from pwn import *; print(\"pwntools\")'",
+            "pwntools",
+            "pwntools in /venv",
+        ),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
         # Task files visible - cybench tasks have nested directories (blog/, nginx/, etc.)
-        # Check that files exist in /root (not just empty directory)
-        ("ls -la /root/ | grep -E '^d|^-' | wc -l", None, "Items in /root"),
+        # Check that files exist in /root (not just empty directory) using test -gt 0
+        ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
         # For tasks with subdirs (chunky, etc.), verify structure is mounted
-        ("find /root -type f 2>/dev/null | head -5", None, "Files under /root (recursive)"),
+        ("test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0", None, "Files under /root (recursive)"),
     ],
     "cybergym": [
         # Working directory: /root
@@ -155,29 +158,27 @@ ENVIRONMENT_CHECKS = {
         # CyberGym-specific: submit.sh MUST exist for PoC submission
         ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present at /submit.sh"),
         ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
-        # Image detection (same as cybench - uses cybench/cybench:latest)
-        ("which gdb || echo 'NOT_FOUND'", None, "[IMAGE] gdb"),
-        ("which ghidra || echo 'NOT_FOUND'", None, "[IMAGE] ghidra"),
-        ("which radare2 || echo 'NOT_FOUND'", None, "[IMAGE] radare2"),
-        # Core security tools
-        ("which python3", None, "python3 installed"),
-        ("which nmap", "/usr/bin/nmap", "nmap installed"),
-        ("which curl", "/usr/bin/curl", "curl installed"),
+        # Required tools - STRICT (same image as cybench)
+        ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+        ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        # Core security tools - STRICT
+        ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
-        # Task files from setup.sh
-        ("ls /root/ 2>/dev/null | wc -l", None, "Files in /root"),
     ],
     "cvebench": [
         # OS: Kali Linux
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # Core security tools for CVE exploitation
-        ("which python3", None, "python3 installed"),
-        ("which nmap", "/usr/bin/nmap", "nmap installed"),
-        ("which curl", "/usr/bin/curl", "curl installed"),
-        ("which wget", "/usr/bin/wget", "wget installed"),
-        # Network access to target
-        ("which nc", None, "netcat installed"),
+        # Core security tools for CVE exploitation - STRICT
+        ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
+        ("which wget", "/usr/bin/wget", "wget at /usr/bin/wget"),
+        # Network access to target - STRICT
+        ("which nc", "/usr/bin/nc", "netcat at /usr/bin/nc"),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
     ],
@@ -191,6 +192,43 @@ ENVIRONMENT_CHECKS = {
     ],
 }
 
+# Task-specific file expectations
+# Maps benchmark -> task_id -> list of expected paths
+# Use "*" as task_id for checks that apply to all tasks in a benchmark
+TASK_FILE_CHECKS = {
+    "cybench": {
+        "chunky": ["/root/blog", "/root/nginx", "/root/rcache"],
+        "avatar": ["/root/chall.py"],
+        "lootstash": ["/root/stash"],
+        "primary_knowledge": [],  # No local files - network-based
+    },
+    "cybergym": {
+        "*": ["/submit.sh"],  # All cybergym tasks need submit.sh
+    },
+    "nyuctf": {
+        "*": ["/home/ctfplayer/ctf_files"],  # Standard location
+    },
+    "intercode-ctf": {
+        "*": ["/ctf"],  # Task files in /ctf directory
+    },
+}
+
+
+def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
+    """Generate file existence checks based on task expectations."""
+    checks = []
+    task_files = TASK_FILE_CHECKS.get(benchmark, {})
+
+    # Try task-specific first, then fallback to wildcard
+    expected = task_files.get(task_id) or task_files.get("*", [])
+
+    for path in expected:
+        checks.append(
+            (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
+        )
+
+    return checks
+
 # Commands to run for submission tests
 SUBMISSION_TESTS = {
     "intercode-ctf": {
@@ -503,7 +541,12 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
 
     # Get benchmark-specific checks from ENVIRONMENT_CHECKS
     env_checks = ENVIRONMENT_CHECKS.get(benchmark, [])
-    for check in env_checks:
+
+    # Add task-specific file checks
+    file_checks = get_file_checks_for_task(benchmark, task_id)
+    all_checks = env_checks + file_checks
+
+    for check in all_checks:
         # Unpack: (command, expected_pattern, description)
         if len(check) == 3:
             cmd, expected, desc = check
@@ -517,7 +560,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
                 ["docker", "exec", container_name, "sh", "-c", cmd],
                 capture_output=True,
                 text=True,
-                timeout=15,
+                timeout=30,  # Increased for slow imports (angr takes ~10s)
             )
             output = result.stdout.strip()
             stderr = result.stderr.strip()
@@ -564,13 +607,20 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
         time.sleep(2)
         docker_child.expect(prompt_patterns[:-1], timeout=30)
         output = strip_ansi(docker_child.before or "")
-        results.append(
-            TestResult(
-                "task score",
-                True,  # Just checking it runs
-                output[:200],
+
+        expected_score = sub_tests.get("score_expect")
+        if expected_score:
+            passed = expected_score.lower() in output.lower()
+            details = output[:200] if passed else f"Expected '{expected_score}' in output: {output[:100]}..."
+            results.append(TestResult("task score", passed, details))
+        else:
+            results.append(
+                TestResult(
+                    "task score",
+                    True,  # Just checking it runs
+                    output[:200],
+                )
             )
-        )
 
         # Submit answer
         console.print("Submitting test answer...")
@@ -605,14 +655,15 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
             timeout=60,
         )
         # EOF (idx=4) is expected - container exits after task submit
-        if idx <= 4:
+        # TIMEOUT (idx=3) is a failure
+        if idx != 3:
             results.append(
                 TestResult("Submission", True, "Answer submitted (container exited)")
             )
         else:
             results.append(
                 TestResult(
-                    "Submission", False, docker_child.before or "", "Submission may have failed"
+                    "Submission", False, docker_child.before or "", "Submission timed out waiting for result"
                 )
             )
     elif idx < 3:
@@ -624,7 +675,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     else:
         results.append(
             TestResult(
-                "Submission", False, docker_child.before or "", "Submission may have failed"
+                "Submission", False, docker_child.before or "", "Submission timed out at prompt"
             )
         )
 
@@ -671,10 +722,19 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     return results
 
 
-def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
+def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
     """Verify session and eval log artifacts were created."""
     results = []
 
+    # Get expected answer from SUBMISSION_TESTS
+    sub_tests = SUBMISSION_TESTS.get(benchmark, {})
+    submit_cmd = sub_tests.get("submit_cmd", "")
+    expected_answer = None
+    if 'task submit "' in submit_cmd:
+        expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
+    elif "task submit '" in submit_cmd:
+        expected_answer = submit_cmd.split("task submit '")[1].rstrip("'")
+
     # Check session in database
     session_info = ssh_query(f"""
         SELECT id, status, score, client_active_seconds, answer
@@ -699,11 +759,23 @@ def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
             "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
         )
     )
-    results.append(
-        TestResult(
-            "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+
+    # Verify answer
+    if expected_answer and benchmark != "cybergym":  # Cybergym submits file content, hard to verify here
+        results.append(
+            TestResult(
+                "Answer matches submission",
+                answer == expected_answer,
+                f"Expected: '{expected_answer}', Got: '{answer}'"
+            )
         )
-    )
+    else:
+        results.append(
+            TestResult(
+                "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+            )
+        )
+
     results.append(
         TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
     )
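The expected-answer extraction above is a plain split on the quoted task submit invocation; anything that matches neither quoting style leaves expected_answer as None and falls through to the looser "Answer recorded" check. A worked example with a placeholder flag (not a value from the package):

    submit_cmd = 'task submit "FLAG{not-a-real-flag}"'   # placeholder command
    expected_answer = None
    if 'task submit "' in submit_cmd:
        expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
    # expected_answer == "FLAG{not-a-real-flag}", which is then compared
    # against the answer column stored for the session.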
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/conftest.py
@@ -22,6 +22,41 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
 TEST_EMAIL = "e2e-test@lyptus.dev"
 CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
 
+# Expected task assignments per benchmark (4 per benchmark)
+# First 2 for pytest API tests, last 2 for interactive/automated tests
+EXPECTED_TASKS = {
+    "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
+    "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
+    "intercode-ctf": [
+        "intercode-ctf_general_skills/task_4",
+        "intercode-ctf_reverse/task_1",
+        "intercode-ctf_reverse/task_3",
+        "intercode-ctf_reverse/task_85",
+    ],
+    "nyuctf": [
+        "2013f-cry-stfu",
+        "2016q-rev-rock",
+        "2023q-rev-baby_s_first",
+        "2019q-cry-super_curve",
+    ],
+    "cybashbench": [
+        "cybashbench_pwn/nl2bash_003",
+        "cybashbench_reverse/nl2bash_004",
+        "cybashbench_web/nl2bash_001",
+        "cybashbench_forensics/nl2bash_005",
+    ],
+    "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+    "nl2bash": [
+        "nl2bash_complex/task_8581",
+        "nl2bash_complex/task_713",
+        "nl2bash_complex/task_712",
+        "nl2bash_complex/task_8796",
+    ],
+}
+
+# Total expected assignments (calculated from EXPECTED_TASKS)
+EXPECTED_ASSIGNMENT_COUNT = sum(len(tasks) for tasks in EXPECTED_TASKS.values())
+
 
 def ssh_query(query: str) -> str:
     """Run a sqlite3 query on the VPS (READ-ONLY)."""
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/e2e_test.py
@@ -61,6 +61,12 @@ BENCHMARK_TASKS = {
         "cybashbench_forensics/nl2bash_005",
     ],
     "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+    "nl2bash": [
+        "nl2bash_complex/task_8581",
+        "nl2bash_complex/task_713",
+        "nl2bash_complex/task_712",
+        "nl2bash_complex/task_8796",
+    ],
 }
 
 # Instructions for each benchmark type
@@ -688,38 +694,158 @@ def cleanup():
     help="Admin password for API access",
 )
 @click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompts")
-def full(admin_password: str, yes: bool):
-    """Run complete E2E test suite (setup, run all, verify, cleanup)."""
-    console.print("\n[bold]Full E2E Test Suite[/bold]\n")
-    console.print("[yellow]This will run all benchmarks interactively.[/yellow]")
-    console.print("You'll need to interact with each task container.\n")
-
-    if not yes and not click.confirm("Continue?"):
+@click.option("--skip-setup", is_flag=True, help="Skip setup if already done")
+@click.option("--cleanup-after", is_flag=True, help="Run cleanup after tests")
+def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
+    """Run complete E2E test suite in 3 phases.
+
+    Phase 1: Infrastructure tests (pytest, fast, no containers)
+    Phase 2: Automated benchmark E2E tests (pexpect, creates completed sessions)
+    Phase 3: Session verification tests (pytest, validates completed sessions)
+
+    This is fully automated - no user interaction required.
+    """
+    console.print(Panel("[bold]Full E2E Test Suite - 3 Phases[/bold]", style="cyan"))
+    console.print("""
+    [dim]Phase 1:[/dim] Infrastructure tests (pytest)
+    [dim]Phase 2:[/dim] Automated benchmark E2E tests (pexpect)
+    [dim]Phase 3:[/dim] Session verification tests (pytest)
+    """)
+
+    if not yes and not click.confirm("Run full automated E2E suite?"):
         raise click.ClickException("Aborted")
 
-    # Setup
-    ctx = click.get_current_context()
-    ctx.invoke(setup, admin_password=admin_password, yes=yes)
+    results = {"phase1": None, "phase2": {}, "phase3": None}
+    tests_dir = Path(__file__).parent
+
+    # Setup (unless skipped)
+    if not skip_setup:
+        console.print("\n" + "=" * 60)
+        console.print("[bold cyan]SETUP: Creating test user and assignments[/bold cyan]")
+        console.print("=" * 60)
+        ctx = click.get_current_context()
+        ctx.invoke(setup, admin_password=admin_password, yes=True)
+
+    # Phase 1: Infrastructure tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 1: Infrastructure Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running pytest on infrastructure, imports, benchmark flows...[/dim]\n")
+
+    phase1_result = subprocess.run(
+        [
+            "uv", "run", "pytest",
+            str(tests_dir / "test_infrastructure.py"),
+            str(tests_dir / "test_runtime_imports.py"),
+            str(tests_dir / "test_benchmark_flows.py"),
+            "-v", "--tb=short",
+        ],
+        cwd=tests_dir.parent.parent,
+    )
+    results["phase1"] = phase1_result.returncode == 0
+
+    if not results["phase1"]:
+        console.print("\n[red bold]Phase 1 FAILED - stopping[/red bold]")
+        _print_full_summary(results)
+        raise SystemExit(1)
 
-    # Run each benchmark
-    for benchmark in BENCHMARK_TASKS.keys():
-        console.print(f"\n{'=' * 50}")
-        console.print(f"[bold]Benchmark: {benchmark}[/bold]")
+    console.print("\n[green]Phase 1 PASSED[/green]")
+
+    # Phase 2: Automated benchmark E2E tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 2: Automated Benchmark E2E Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running automated tests for each benchmark via pexpect...[/dim]\n")
 
-        for i in range(2):
-            if click.confirm(f"\nRun task {i+1}/2 for {benchmark}?"):
-                ctx.invoke(run, benchmark=benchmark, task_index=i)
+    from automated_runner import run_benchmark_test
 
-    # Verify
-    console.print(f"\n{'=' * 50}")
-    ctx.invoke(verify, admin_password=admin_password)
+    for benchmark in BENCHMARK_TASKS.keys():
+        console.print(f"\n[bold]--- {benchmark} ---[/bold]")
+        try:
+            # Run task index 2 (third task, reserved for automated E2E)
+            success = run_benchmark_test(benchmark, task_index=2)
+            results["phase2"][benchmark] = success
+            if success:
+                console.print(f"[green]{benchmark}: PASSED[/green]")
+            else:
+                console.print(f"[red]{benchmark}: FAILED[/red]")
+        except Exception as e:
+            console.print(f"[red]{benchmark}: ERROR - {e}[/red]")
+            results["phase2"][benchmark] = False
+
+    phase2_passed = all(results["phase2"].values())
+    if not phase2_passed:
+        console.print("\n[yellow]Phase 2 had failures - continuing to Phase 3[/yellow]")
+
+    # Phase 3: Session verification tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 3: Session Verification Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running pytest on session lifecycle and eval logs...[/dim]\n")
+
+    phase3_result = subprocess.run(
+        [
+            "uv", "run", "pytest",
+            str(tests_dir / "test_session_lifecycle.py"),
+            str(tests_dir / "test_eval_logs.py"),
+            "-v", "--tb=short",
+        ],
+        cwd=tests_dir.parent.parent,
+    )
+    results["phase3"] = phase3_result.returncode == 0
+
+    # Summary
+    _print_full_summary(results)
 
     # Cleanup
-    console.print(f"\n{'=' * 50}")
-    if click.confirm("Run cleanup?"):
+    if cleanup_after:
+        console.print("\n" + "=" * 60)
+        console.print("[bold cyan]CLEANUP[/bold cyan]")
+        ctx = click.get_current_context()
         ctx.invoke(cleanup)
 
-    console.print("\n[bold green]Full E2E test complete![/bold green]")
+    # Exit with appropriate code
+    all_passed = results["phase1"] and phase2_passed and results["phase3"]
+    if all_passed:
+        console.print("\n[bold green]All phases PASSED![/bold green]")
+    else:
+        console.print("\n[bold red]Some phases FAILED[/bold red]")
+        raise SystemExit(1)
+
+
+def _print_full_summary(results: dict):
+    """Print summary table of all phases."""
+    console.print("\n" + "=" * 60)
+    console.print("[bold]SUMMARY[/bold]")
+    console.print("=" * 60)
+
+    table = Table()
+    table.add_column("Phase", style="cyan")
+    table.add_column("Status")
+    table.add_column("Details")
+
+    # Phase 1
+    if results["phase1"] is not None:
+        status = "[green]PASSED[/green]" if results["phase1"] else "[red]FAILED[/red]"
+        table.add_row("Phase 1: Infrastructure", status, "pytest infra/imports/flows")
+
+    # Phase 2
+    if results["phase2"]:
+        passed = sum(1 for v in results["phase2"].values() if v)
+        total = len(results["phase2"])
+        status = "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+        details = ", ".join(
+            f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
+            for b, v in results["phase2"].items()
+        )
+        table.add_row("Phase 2: Benchmarks", status, details)
+
+    # Phase 3
+    if results["phase3"] is not None:
+        status = "[green]PASSED[/green]" if results["phase3"] else "[red]FAILED[/red]"
+        table.add_row("Phase 3: Verification", status, "pytest lifecycle/logs")
+
+    console.print(table)
 
 
 if __name__ == "__main__":
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_benchmark_flows.py
@@ -13,7 +13,14 @@ Run with: uv run pytest tests/e2e/test_benchmark_flows.py -v
 import pytest
 import requests
 
-from tests.e2e.conftest import BASE_URL, get_test_user_id, ssh_command, ssh_query
+from tests.e2e.conftest import (
+    BASE_URL,
+    EXPECTED_ASSIGNMENT_COUNT,
+    EXPECTED_TASKS,
+    get_test_user_id,
+    ssh_command,
+    ssh_query,
+)
 
 # Benchmark test configurations
 # First 2 tasks for pytest API tests, last 2 for interactive tests
@@ -367,12 +374,14 @@ class TestCrossBenchmark:
         assert int(count) > 0, f"No assignments for {benchmark}"
 
     def test_total_assignments_correct(self):
-        """Total assignments should be 24 (4 per benchmark)."""
+        """Total assignments should match expected count (4 per benchmark)."""
         count = ssh_query(f"""
             SELECT COUNT(*) FROM assignments
             WHERE user_id = '{get_test_user_id()}'
         """)
-        assert int(count) == 24
+        assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+            f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+        )
 
 
 # =============================================================================
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_eval_logs.py
@@ -85,8 +85,8 @@ class TestLocalEvalLogs:
             pytest.skip("Local eval logs directory not found")
 
         logs = list(LOCAL_EVAL_LOGS_DIR.glob("*.eval"))
-        # Just verify we can list them
-        assert isinstance(logs, list)
+        # Verify we found eval logs (if E2E tests have run, there should be some)
+        assert len(logs) > 0, f"No eval logs found in {LOCAL_EVAL_LOGS_DIR}"
 
 
 # =============================================================================
@@ -103,11 +103,12 @@ class TestVPSEvalLogs:
         assert result == "exists", "VPS eval logs directory not found"
 
     def test_vps_eval_log_count(self):
-        """Should be able to count eval logs on VPS."""
+        """Should have eval logs on VPS if sessions have completed."""
         result = ssh_command(f"find {VPS_EVAL_LOGS_DIR} -name '*.eval.gz' 2>/dev/null | wc -l")
-        count = int(result.strip()) if result.strip().isdigit() else 0
-        # Just verify we can count them
-        assert count >= 0
+        assert result.strip().isdigit(), f"Invalid count result: {result}"
+        count = int(result.strip())
+        # If E2E tests have run, there should be eval logs
+        assert count > 0, f"No eval logs found on VPS in {VPS_EVAL_LOGS_DIR}"
 
     def test_completed_sessions_have_eval_log_path(self):
         """Completed sessions should have eval_log_path recorded."""
@@ -241,11 +242,11 @@ class TestEvalLogUpload:
             WHERE session_id = '{session_id}'
         """)
 
-        # Should have upload-related events
+        # Should have upload-related events for completed sessions
         event_list = events.split("\n") if events else []
         has_upload = any("upload" in e.lower() for e in event_list)
-        # Note: upload events might not always exist
-        assert isinstance(has_upload, bool)
+        # Completed sessions should have upload events
+        assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"
 
     def test_eval_log_size_reasonable(self):
         """Eval logs should be reasonably sized (not empty, not huge)."""
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_infrastructure.py
@@ -25,31 +25,8 @@ from tests.e2e.conftest import (
     ssh_query,
 )
 
-# Expected task assignments (4 per benchmark)
-# First 2 for pytest API tests, last 2 for interactive tests
-EXPECTED_TASKS = {
-    "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
-    "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
-    "intercode-ctf": [
-        "intercode-ctf_general_skills/task_4",
-        "intercode-ctf_reverse/task_1",
-        "intercode-ctf_reverse/task_3",
-        "intercode-ctf_reverse/task_85",
-    ],
-    "nyuctf": [
-        "2013f-cry-stfu",
-        "2016q-rev-rock",
-        "2023q-rev-baby_s_first",
-        "2019q-cry-super_curve",
-    ],
-    "cybashbench": [
-        "cybashbench_pwn/nl2bash_003",
-        "cybashbench_reverse/nl2bash_004",
-        "cybashbench_web/nl2bash_001",
-        "cybashbench_forensics/nl2bash_005",
-    ],
-    "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
-}
+# Import shared constants from conftest
+from tests.e2e.conftest import EXPECTED_TASKS, EXPECTED_ASSIGNMENT_COUNT
 
 
 # =============================================================================
@@ -133,11 +110,13 @@ class TestAssignments:
     """Test that task assignments are correctly set up."""
 
     def test_correct_number_of_assignments(self):
-        """Test user should have exactly 12 assignments."""
+        """Test user should have expected number of assignments."""
        count = ssh_query(
            f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
        )
-        assert int(count) == 24
+        assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+            f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+        )
 
     @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
     def test_benchmark_tasks_assigned(self, benchmark, tasks):
@@ -226,8 +205,9 @@ class TestAPIEndpoints:
         )
         assert response.status_code == 200
         assignments = response.json()
-        # User may have different number of assignments
-        assert isinstance(assignments, list)
+        # Test user should have assignments from E2E setup
+        assert isinstance(assignments, list), "Expected list of assignments"
+        assert len(assignments) > 0, "Test user should have at least one assignment"
 
     def test_assignment_has_task_info(self, api_headers):
         """Assignments should include task information."""
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_runtime_imports.py
@@ -149,8 +149,10 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')
             pytest.fail(f"Import failed in container: {result.stderr}")
 
         assert "Loaded" in result.stdout
-        # Should have at least 6 benchmarks
-        assert "6 benchmarks" in result.stdout or "7 benchmarks" in result.stdout
+        # Should have exactly 7 benchmarks
+        assert "7 benchmarks" in result.stdout, (
+            f"Expected 7 benchmarks, got: {result.stdout}"
+        )
 
     def test_backend_can_import_adapters(self):
         """Backend should be able to instantiate adapters."""
@@ -176,9 +178,11 @@ for name, cls in HUMAN_REGISTRY.items():
         if "FAIL" in result.stdout:
             pytest.fail(f"Adapter instantiation failed: {result.stdout}")
 
-        # Should have OK for all benchmarks
+        # All benchmarks should show OK - STRICT check
         for benchmark in BENCHMARKS:
-            assert f"{benchmark}: OK" in result.stdout or benchmark not in result.stdout
+            assert f"{benchmark}: OK" in result.stdout, (
+                f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
+            )
 
 
 class TestLocalImports:
{hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_session_lifecycle.py
@@ -164,17 +164,26 @@ class TestSessionCompletion:
 
     def test_completed_session_has_score(self):
         """Completed sessions should have a score."""
+        # Count total submitted sessions
+        total_submitted = ssh_query(f"""
+            SELECT COUNT(*) FROM sessions
+            WHERE user_id = '{get_test_user_id()}'
+            AND status = 'submitted'
+        """)
+        total = int(total_submitted) if total_submitted else 0
+        if total == 0:
+            pytest.skip("No submitted sessions to verify")
+
+        # Count sessions without score
         sessions_without_score = ssh_query(f"""
             SELECT COUNT(*) FROM sessions
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'submitted'
             AND score IS NULL
         """)
-        # Note: score can legitimately be NULL for some benchmarks
-        # This test documents expected behavior
-        count = int(sessions_without_score)
-        # We just want to verify we can query this
-        assert count >= 0
+        count = int(sessions_without_score) if sessions_without_score else 0
+        # Most submitted sessions should have scores (some benchmarks may not score)
+        assert count < total, f"All {total} sessions missing scores"
 
     def test_completed_session_has_answer(self):
         """Completed sessions should have an answer."""
@@ -208,14 +217,16 @@ class TestSessionState:
     """Test session state verification (read-only)."""
 
     def test_abandoned_sessions_count(self):
-        """Verify we can count abandoned sessions."""
+        """Verify abandoned sessions exist and are queryable."""
         abandoned_count = ssh_query(f"""
             SELECT COUNT(*) FROM sessions
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'abandoned'
         """)
-        # Just verify we can query abandoned sessions
-        assert int(abandoned_count) >= 0
+        count = int(abandoned_count) if abandoned_count else 0
+        # Verify the query returned a valid number (not empty/error)
+        assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
+        # Note: count can legitimately be 0 if no sessions were abandoned
 
     def test_no_stuck_sessions_older_than_24h(self):
         """No in_progress sessions should be older than 24 hours."""
@@ -387,8 +398,9 @@ class TestSessionCancellation:
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'cancelled'
         """)
-        # Just verify we can query cancelled sessions
-        assert int(cancelled) >= 0
+        # Verify query returned valid result
+        assert cancelled.strip().isdigit(), f"Query returned invalid value: {cancelled}"
+        # Note: count can legitimately be 0 if no sessions were cancelled
 
     def test_no_orphaned_in_progress_after_cancel(self):
         """Assignments should not be in_progress if session is cancelled."""
{hte_cli-0.2.19 → hte_cli-0.2.21}/uv.lock
@@ -625,7 +625,7 @@ wheels = [
 
 [[package]]
 name = "hte-cli"
-version = "0.2.18"
+version = "0.2.20"
 source = { editable = "." }
 dependencies = [
     { name = "click" },