hte-cli 0.2.19__tar.gz → 0.2.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {hte_cli-0.2.19 → hte_cli-0.2.22}/PKG-INFO +1 -1
  2. {hte_cli-0.2.19 → hte_cli-0.2.22}/pyproject.toml +1 -1
  3. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/cli.py +21 -1
  4. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/automated_runner.py +180 -79
  5. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/conftest.py +35 -0
  6. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/e2e_test.py +154 -25
  7. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_benchmark_flows.py +12 -3
  8. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_eval_logs.py +43 -22
  9. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_infrastructure.py +9 -29
  10. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_runtime_imports.py +8 -4
  11. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_session_lifecycle.py +22 -10
  12. {hte_cli-0.2.19 → hte_cli-0.2.22}/uv.lock +1 -1
  13. {hte_cli-0.2.19 → hte_cli-0.2.22}/.gitignore +0 -0
  14. {hte_cli-0.2.19 → hte_cli-0.2.22}/README.md +0 -0
  15. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/__init__.py +0 -0
  16. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/__main__.py +0 -0
  17. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/api_client.py +0 -0
  18. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/config.py +0 -0
  19. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/errors.py +0 -0
  20. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/events.py +0 -0
  21. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/image_utils.py +0 -0
  22. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/runner.py +0 -0
  23. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/scorers.py +0 -0
  24. {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/version_check.py +0 -0
  25. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/__init__.py +0 -0
  26. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/__init__.py +0 -0
  27. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/verify_docker_deps.py +0 -0
  28. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/__init__.py +0 -0
  29. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/conftest.py +0 -0
  30. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_image_utils.py +0 -0
  31. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_scorers.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hte-cli
- Version: 0.2.19
+ Version: 0.2.22
  Summary: Human Time-to-Completion Evaluation CLI
  Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
  Author: Lyptus Research
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "hte-cli"
- version = "0.2.19"
+ version = "0.2.22"
  description = "Human Time-to-Completion Evaluation CLI"
  readme = "README.md"
  requires-python = ">=3.11"
src/hte_cli/cli.py
@@ -280,6 +280,14 @@ def session_join(ctx, session_id: str, force_setup: bool):
  },
  }

+ # Send session_started event (records CLI version for debugging)
+ events.session_started(
+ {
+ "cli_version": __version__,
+ "task_id": session_info["task_id"],
+ }
+ )
+
  # Step 3: Run setup (skip if reconnecting without force)
  setup_start_time = time.monotonic()
  images = []
@@ -429,13 +437,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
  console.print(f"Answer: {result.answer}")
  console.print(f"Time: {result.time_seconds:.1f}s")

+ # Track upload size and timing
+ upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
+ upload_size_kb = upload_size_bytes / 1024
+
+ events.upload_started(size_bytes=upload_size_bytes)
+ upload_start_time = time.monotonic()
+
  # Upload to server
  with Progress(
  SpinnerColumn(),
  TextColumn("[progress.description]{task.description}"),
  console=console,
  ) as progress:
- progress.add_task("Uploading result...", total=None)
+ size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
+ progress.add_task(f"Uploading result{size_str}...", total=None)
  try:
  upload_result = api.upload_result(
  session_id=session_id,
@@ -450,6 +466,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
  console.print(f"[red]Failed to upload result: {e}[/red]")
  sys.exit(1)

+ # Record upload completion
+ upload_duration = time.monotonic() - upload_start_time
+ events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
+
  if upload_result.get("score") is not None:
  console.print(f"Score: {upload_result['score']}")

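The net effect of these cli.py changes is that a completed session now records a session_started event (carrying the CLI version) plus upload_started/upload_completed events around the result upload. A minimal sketch, assuming only direct read access to the server's sqlite database and the session_events columns already used by the E2E tests further down (the database path is illustrative, not part of the package):

import sqlite3

def list_session_events(db_path: str, session_id: str) -> list[str]:
    """Return the event_type values recorded for one session."""
    con = sqlite3.connect(db_path)
    try:
        rows = con.execute(
            "SELECT event_type FROM session_events WHERE session_id = ?",
            (session_id,),
        ).fetchall()
    finally:
        con.close()
    return [row[0] for row in rows]

# After a successful run on 0.2.22 the list should include session_started,
# upload_started and upload_completed alongside the setup events.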
tests/e2e/automated_runner.py
@@ -52,11 +52,15 @@ def strip_ansi(text: str) -> str:

  VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")

+ # Use the pipx-installed CLI (what experts use), not the local venv version
+ # This ensures E2E tests validate the actual published PyPI package
+ HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
  # Environment verification commands per benchmark
  # Based on docs/benchmark-environments.md
  # Format: (command, expected_pattern, description)
- # expected_pattern=None means just check command succeeds (exit 0)
- # These tests are STRICT - they verify the exact environment specified in docs
+ # STRICT: All tests must pass - no optional/informational checks
+ # These tests verify the exact environment specified in docs
  ENVIRONMENT_CHECKS = {
  "intercode-ctf": [
  # Working directory: /ctf/{task_num} (e.g., /ctf/3, /ctf/85)
@@ -69,8 +73,8 @@ ENVIRONMENT_CHECKS = {
  ("which netcat", "/usr/bin/netcat", "netcat installed"),
  ("which tcpdump", "/usr/bin/tcpdump", "tcpdump installed"),
  ("which curl", "/usr/bin/curl", "curl installed"),
- # Forensics tools
- ("which mmls", None, "sleuthkit installed (mmls)"),
+ # Forensics tools - STRICT: must exist
+ ("which mmls", "/usr/bin/mmls", "mmls (sleuthkit) at /usr/bin/mmls"),
  # Python packages - must import successfully
  ("python3 -c 'from pwn import *; print(\"pwntools\")'", "pwntools", "pwntools importable"),
  ("python3 -c 'from scapy.all import *; print(\"scapy\")'", "scapy", "scapy importable"),
@@ -90,16 +94,17 @@ ENVIRONMENT_CHECKS = {
  ("pwd", "/home/ctfplayer", "Working directory is /home/ctfplayer"),
  # User: ctfplayer (NOT root)
  ("whoami", "ctfplayer", "Running as ctfplayer user"),
- ("id -u", None, "User ID check"), # Should be non-zero
+ # User ID must be 1000 (standard non-root)
+ ("id -u", "1000", "User ID is 1000"),
  # Passwordless sudo must work
  ("sudo -n whoami", "root", "Passwordless sudo to root"),
  # OS: Ubuntu 22.04 specifically
  ("cat /etc/os-release | grep VERSION_ID", "22.04", "Ubuntu 22.04 base"),
- # Core tools at expected paths
+ # Core tools at expected paths - STRICT
  ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
- ("which radare2", "/usr/bin/radare2", "radare2 installed"),
- ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
- ("which jadx", None, "jadx installed"),
+ ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+ ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+ ("which jadx", "/usr/local/bin/jadx", "jadx at /usr/local/bin/jadx"),
  # Python venv must exist at expected location
  (
  "test -f /home/ctfplayer/.ctfenv/bin/python3 && echo 'venv'",
@@ -112,72 +117,72 @@ ENVIRONMENT_CHECKS = {
  ("python3 -c 'import z3; print(\"z3\")'", "z3", "z3-solver importable"),
  # 32-bit support (for CTF binaries)
  ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
+ # Task files location - STRICT
+ ("test -d /home/ctfplayer/ctf_files && echo 'EXISTS'", "EXISTS", "ctf_files directory exists"),
  ],
  "cybench": [
- # Working directory: /root (Kali default)
- ("pwd", "/root", "Working directory is /root"),
+ # Working directory: /app (official cybench/cybench:latest image)
+ ("pwd", "/app", "Working directory is /app"),
  # OS: Kali Linux (must contain 'Kali' in name)
  ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
- # Image detection tools (see docs/cybench-image-investigation.md):
- # - aisiuk/evals-cybench-agent-sandbox: NO gdb, NO ghidra, YES radare2
- # - cybench/cybench:latest: YES gdb, NO ghidra, YES radare2
- # - cybench/kali-linux-large:latest: YES gdb, YES ghidra, YES radare2
- # All of these pass - we just want to know which image
- ("which gdb || echo 'NOT_FOUND'", None, "[IMAGE] gdb"),
- ("which ghidra || echo 'NOT_FOUND'", None, "[IMAGE] ghidra"),
- ("which radare2 || echo 'NOT_FOUND'", None, "[IMAGE] radare2"),
- ("which binwalk || echo 'NOT_FOUND'", None, "[IMAGE] binwalk"),
- # Security tools
- ("which nmap", "/usr/bin/nmap", "nmap installed"),
- ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
- ("which msfconsole || echo 'NOT_FOUND'", None, "[IMAGE] msfconsole"),
- ("which hydra || echo 'NOT_FOUND'", None, "[IMAGE] hydra"),
- ("which john || echo 'NOT_FOUND'", None, "[IMAGE] john"),
- # Docker-in-Docker capability (required for web challenges)
+ # Required tools - STRICT: must exist at expected paths
+ # Using cybench/cybench:latest which has all these tools
+ ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+ ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+ ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+ ("which binwalk", "/usr/bin/binwalk", "binwalk at /usr/bin/binwalk"),
+ # Security tools - STRICT
+ ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+ ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+ # Docker available (daemon may not be running in all test contexts)
  ("docker --version", "Docker version", "Docker available"),
- ("docker ps 2>&1 | head -1", None, "Docker daemon accessible"),
- # Python 3
- ("which python3", None, "python3 installed"),
- ("python3 --version", "Python 3", "Python 3.x"),
+ # NOTE: Docker daemon check (docker ps) removed - DinD requires privileged
+ # mode which isn't always available. Web challenges needing DinD will
+ # fail at runtime if daemon isn't accessible.
+ # Python via venv - STRICT
+ ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
+ (
+ "/venv/bin/python3 -c 'from pwn import *; print(\"pwntools\")'",
+ "pwntools",
+ "pwntools in /venv",
+ ),
  # apt-get for additional tools
  ("apt-get --version", "apt", "apt-get available"),
  # Task files visible - cybench tasks have nested directories (blog/, nginx/, etc.)
- # Check that files exist in /root (not just empty directory)
- ("ls -la /root/ | grep -E '^d|^-' | wc -l", None, "Items in /root"),
+ # Check that files exist in /root (not just empty directory) using test -gt 0
+ ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
  # For tasks with subdirs (chunky, etc.), verify structure is mounted
- ("find /root -type f 2>/dev/null | head -5", None, "Files under /root (recursive)"),
+ ("test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0", None, "Files under /root (recursive)"),
  ],
  "cybergym": [
  # Working directory: /root
  ("pwd", "/root", "Working directory is /root"),
  # OS: Kali Linux (same base as cybench)
  ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
- # CyberGym-specific: submit.sh MUST exist for PoC submission
- ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present at /submit.sh"),
- ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
- # Image detection (same as cybench - uses cybench/cybench:latest)
- ("which gdb || echo 'NOT_FOUND'", None, "[IMAGE] gdb"),
- ("which ghidra || echo 'NOT_FOUND'", None, "[IMAGE] ghidra"),
- ("which radare2 || echo 'NOT_FOUND'", None, "[IMAGE] radare2"),
- # Core security tools
- ("which python3", None, "python3 installed"),
- ("which nmap", "/usr/bin/nmap", "nmap installed"),
- ("which curl", "/usr/bin/curl", "curl installed"),
+ # CyberGym-specific: submit.sh in home dir for PoC submission
+ ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
+ ("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
+ # Required tools - STRICT (same image as cybench)
+ ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+ ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+ ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+ # Python via venv (same as cybench/cybench:latest)
+ ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
+ ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+ ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
  # apt-get for additional tools
  ("apt-get --version", "apt", "apt-get available"),
- # Task files from setup.sh
- ("ls /root/ 2>/dev/null | wc -l", None, "Files in /root"),
  ],
  "cvebench": [
  # OS: Kali Linux
  ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
- # Core security tools for CVE exploitation
- ("which python3", None, "python3 installed"),
- ("which nmap", "/usr/bin/nmap", "nmap installed"),
- ("which curl", "/usr/bin/curl", "curl installed"),
- ("which wget", "/usr/bin/wget", "wget installed"),
- # Network access to target
- ("which nc", None, "netcat installed"),
+ # Core security tools for CVE exploitation - STRICT
+ ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+ ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+ ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
+ ("which wget", "/usr/bin/wget", "wget at /usr/bin/wget"),
+ # Network access to target - STRICT
+ ("which nc", "/usr/bin/nc", "netcat at /usr/bin/nc"),
  # apt-get for additional tools
  ("apt-get --version", "apt", "apt-get available"),
  ],
@@ -191,23 +196,60 @@ ENVIRONMENT_CHECKS = {
  ],
  }

+ # Task-specific file expectations
+ # Maps benchmark -> task_id -> list of expected paths
+ # Use "*" as task_id for checks that apply to all tasks in a benchmark
+ TASK_FILE_CHECKS = {
+ "cybench": {
+ "chunky": ["/root/blog", "/root/nginx", "/root/rcache"],
+ "avatar": ["/root/chall.py"],
+ "lootstash": ["/root/stash"],
+ "primary_knowledge": [], # No local files - network-based
+ },
+ "cybergym": {
+ "*": ["/root/submit.sh"], # All cybergym tasks need submit.sh in home
+ },
+ "nyuctf": {
+ "*": ["/home/ctfplayer/ctf_files"], # Standard location
+ },
+ "intercode-ctf": {
+ "*": ["/ctf"], # Task files in /ctf directory
+ },
+ }
+
+
+ def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
+ """Generate file existence checks based on task expectations."""
+ checks = []
+ task_files = TASK_FILE_CHECKS.get(benchmark, {})
+
+ # Try task-specific first, then fallback to wildcard
+ expected = task_files.get(task_id) or task_files.get("*", [])
+
+ for path in expected:
+ checks.append(
+ (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
+ )
+
+ return checks
+
  # Commands to run for submission tests
  SUBMISSION_TESTS = {
  "intercode-ctf": {
  "score_cmd": 'task score "test_flag_123"',
- "score_expect": "incorrect", # Should say incorrect
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
  "submit_cmd": 'task submit "test_flag_for_e2e"',
  "submit_expect": "submitted",
  },
  "nyuctf": {
  "score_cmd": 'task score "test_flag_123"',
- "score_expect": "incorrect",
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
  "submit_cmd": 'task submit "test_flag_for_e2e"',
  "submit_expect": "submitted",
  },
  "cybench": {
  "score_cmd": 'task score "test_flag_123"',
- "score_expect": "incorrect",
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
  "submit_cmd": 'task submit "test_flag_for_e2e"',
  "submit_expect": "submitted",
  },
@@ -228,6 +270,12 @@ SUBMISSION_TESTS = {
  "submit_cmd": 'task submit "ls -la"',
  "submit_expect": "submitted",
  },
+ "nl2bash": {
+ # Single-shot - no score command (same as cybashbench)
+ "score_cmd": None,
+ "submit_cmd": 'task submit "ls -la"',
+ "submit_expect": "submitted",
+ },
  }

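For reference, this is roughly what the new get_file_checks_for_task helper added above produces; the import path is an assumption based on the tests/e2e/automated_runner.py layout shown in the file list:

from tests.e2e.automated_runner import get_file_checks_for_task

# A task with explicit expectations yields one (command, expected, description)
# tuple per path listed in TASK_FILE_CHECKS.
checks = get_file_checks_for_task("cybench", "chunky")
# [("test -e /root/blog && echo 'EXISTS'", "EXISTS", "/root/blog exists"),
#  ("test -e /root/nginx && echo 'EXISTS'", "EXISTS", "/root/nginx exists"),
#  ("test -e /root/rcache && echo 'EXISTS'", "EXISTS", "/root/rcache exists")]

# Tasks without an entry fall back to the benchmark's "*" wildcard, and an empty
# entry (e.g. primary_knowledge) simply adds no extra file checks.
assert get_file_checks_for_task("cybench", "primary_knowledge") == []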
@@ -336,9 +384,16 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

  # Start the CLI using the new session join flow
  # Session has status="created", so CLI will run full setup
- console.print(f"Launching hte-cli session join {session_id}...")
+ # Use explicit pipx path to test the published PyPI version, not local dev
+ if not HTE_CLI_PATH.exists():
+ console.print(f"[red]hte-cli not found at {HTE_CLI_PATH}[/red]")
+ console.print("[yellow]Install with: pipx install hte-cli[/yellow]")
+ results.append(TestResult("CLI installed", False, "", f"hte-cli not at {HTE_CLI_PATH}"))
+ return results
+
+ console.print(f"Launching {HTE_CLI_PATH} session join {session_id}...")
  child = pexpect.spawn(
- f"hte-cli session join {session_id}",
+ f"{HTE_CLI_PATH} session join {session_id}",
  encoding="utf-8",
  timeout=timeout,
  env={**os.environ, "TERM": "dumb"}, # Disable colors for easier parsing
@@ -388,6 +443,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  results.append(TestResult("Environment setup", True, "Environment ready"))

  # Wait for the "Login to the system" message and docker exec command
+ # CVE bench builds containers from source, can take 5+ minutes
  console.print("Waiting for docker exec command...")
  idx = child.expect(
  [
@@ -395,7 +451,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  r"docker exec -it",
  pexpect.TIMEOUT,
  ],
- timeout=120,
+ timeout=300, # 5 minutes for slow builds (cvebench)
  )

  if idx == 2: # TIMEOUT
@@ -503,7 +559,12 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

  # Get benchmark-specific checks from ENVIRONMENT_CHECKS
  env_checks = ENVIRONMENT_CHECKS.get(benchmark, [])
- for check in env_checks:
+
+ # Add task-specific file checks
+ file_checks = get_file_checks_for_task(benchmark, task_id)
+ all_checks = env_checks + file_checks
+
+ for check in all_checks:
  # Unpack: (command, expected_pattern, description)
  if len(check) == 3:
  cmd, expected, desc = check
@@ -517,7 +578,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  ["docker", "exec", container_name, "sh", "-c", cmd],
  capture_output=True,
  text=True,
- timeout=15,
+ timeout=30, # Increased for slow imports (angr takes ~10s)
  )
  output = result.stdout.strip()
  stderr = result.stderr.strip()
@@ -560,17 +621,29 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

  # Test score command if available
  if sub_tests.get("score_cmd"):
+ # Clear buffer before score test to avoid capturing stale output
+ try:
+ docker_child.read_nonblocking(size=10000, timeout=0.5)
+ except Exception:
+ pass
  docker_child.sendline(sub_tests["score_cmd"])
  time.sleep(2)
  docker_child.expect(prompt_patterns[:-1], timeout=30)
  output = strip_ansi(docker_child.before or "")
- results.append(
- TestResult(
- "task score",
- True, # Just checking it runs
- output[:200],
+
+ expected_score = sub_tests.get("score_expect")
+ if expected_score:
+ passed = expected_score.lower() in output.lower()
+ details = output[:200] if passed else f"Expected '{expected_score}' in output: {output[:100]}..."
+ results.append(TestResult("task score", passed, details))
+ else:
+ results.append(
+ TestResult(
+ "task score",
+ True, # Just checking it runs
+ output[:200],
+ )
  )
- )

  # Submit answer
  console.print("Submitting test answer...")
@@ -605,14 +678,15 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  timeout=60,
  )
  # EOF (idx=4) is expected - container exits after task submit
- if idx <= 4:
+ # TIMEOUT (idx=3) is a failure
+ if idx != 3:
  results.append(
  TestResult("Submission", True, "Answer submitted (container exited)")
  )
  else:
  results.append(
  TestResult(
- "Submission", False, docker_child.before or "", "Submission may have failed"
+ "Submission", False, docker_child.before or "", "Submission timed out waiting for result"
  )
  )
  elif idx < 3:
@@ -624,7 +698,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  else:
  results.append(
  TestResult(
- "Submission", False, docker_child.before or "", "Submission may have failed"
+ "Submission", False, docker_child.before or "", "Submission timed out at prompt"
  )
  )

@@ -671,10 +745,19 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
  return results


- def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
+ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
  """Verify session and eval log artifacts were created."""
  results = []

+ # Get expected answer from SUBMISSION_TESTS
+ sub_tests = SUBMISSION_TESTS.get(benchmark, {})
+ submit_cmd = sub_tests.get("submit_cmd", "")
+ expected_answer = None
+ if 'task submit "' in submit_cmd:
+ expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
+ elif "task submit '" in submit_cmd:
+ expected_answer = submit_cmd.split("task submit '")[1].rstrip("'")
+
  # Check session in database
  session_info = ssh_query(f"""
@@ -699,14 +782,32 @@ def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
  "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
  )
  )
- results.append(
- TestResult(
- "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+
+ # Verify answer
+ if expected_answer and benchmark != "cybergym": # Cybergym submits file content, hard to verify here
+ results.append(
+ TestResult(
+ "Answer matches submission",
+ answer == expected_answer,
+ f"Expected: '{expected_answer}', Got: '{answer}'"
+ )
+ )
+ else:
+ results.append(
+ TestResult(
+ "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+ )
+ )
+
+ # Score check - some benchmarks compute scores server-side later (not immediately)
+ no_immediate_score = benchmark in ("cybashbench", "nl2bash")
+ if no_immediate_score:
+ # These benchmarks don't produce immediate scores - skip check
+ pass
+ else:
+ results.append(
+ TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
  )
- )
- results.append(
- TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
- )


  # Check events (new flow uses setup_started/setup_completed instead of session_started)
tests/e2e/conftest.py
@@ -22,6 +22,41 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
  TEST_EMAIL = "e2e-test@lyptus.dev"
  CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"

+ # Expected task assignments per benchmark (4 per benchmark)
+ # First 2 for pytest API tests, last 2 for interactive/automated tests
+ EXPECTED_TASKS = {
+ "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
+ "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
+ "intercode-ctf": [
+ "intercode-ctf_general_skills/task_4",
+ "intercode-ctf_reverse/task_1",
+ "intercode-ctf_reverse/task_3",
+ "intercode-ctf_reverse/task_85",
+ ],
+ "nyuctf": [
+ "2013f-cry-stfu",
+ "2016q-rev-rock",
+ "2023q-rev-baby_s_first",
+ "2019q-cry-super_curve",
+ ],
+ "cybashbench": [
+ "cybashbench_pwn/nl2bash_003",
+ "cybashbench_reverse/nl2bash_004",
+ "cybashbench_web/nl2bash_001",
+ "cybashbench_forensics/nl2bash_005",
+ ],
+ "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+ "nl2bash": [
+ "nl2bash_complex/task_8581",
+ "nl2bash_complex/task_713",
+ "nl2bash_complex/task_712",
+ "nl2bash_complex/task_8796",
+ ],
+ }
+
+ # Total expected assignments (calculated from EXPECTED_TASKS)
+ EXPECTED_ASSIGNMENT_COUNT = sum(len(tasks) for tasks in EXPECTED_TASKS.values())
+

  def ssh_query(query: str) -> str:
  """Run a sqlite3 query on the VPS (READ-ONLY)."""
tests/e2e/e2e_test.py
@@ -37,6 +37,9 @@ TEST_NAME = "E2E Test User"
  # CLI config path (matches platformdirs on macOS)
  CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"

+ # Use the pipx-installed CLI (what experts use), not the local venv version
+ HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
  # Task assignments: 4 per benchmark
  # First 2 for pytest API tests, last 2 for interactive tests
  BENCHMARK_TASKS = {
@@ -61,6 +64,12 @@ BENCHMARK_TASKS = {
  "cybashbench_forensics/nl2bash_005",
  ],
  "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+ "nl2bash": [
+ "nl2bash_complex/task_8581",
+ "nl2bash_complex/task_713",
+ "nl2bash_complex/task_712",
+ "nl2bash_complex/task_8796",
+ ],
  }

  # Instructions for each benchmark type
@@ -341,10 +350,10 @@ def setup(admin_password: str, yes: bool):
  CLI_CONFIG_PATH.write_text(json.dumps(config, indent=2))
  console.print("[green]CLI config written[/green]")

- # 7. Verify CLI works
+ # 7. Verify CLI works (use pipx version, not local venv)
  console.print("\nVerifying CLI authentication...")
  result = subprocess.run(
- ["hte-cli", "auth", "status"],
+ [str(HTE_CLI_PATH), "auth", "status"],
  capture_output=True,
  text=True,
  )
@@ -688,38 +697,158 @@ def cleanup():
  help="Admin password for API access",
  )
  @click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompts")
- def full(admin_password: str, yes: bool):
- """Run complete E2E test suite (setup, run all, verify, cleanup)."""
- console.print("\n[bold]Full E2E Test Suite[/bold]\n")
- console.print("[yellow]This will run all benchmarks interactively.[/yellow]")
- console.print("You'll need to interact with each task container.\n")
-
- if not yes and not click.confirm("Continue?"):
+ @click.option("--skip-setup", is_flag=True, help="Skip setup if already done")
+ @click.option("--cleanup-after", is_flag=True, help="Run cleanup after tests")
+ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
+ """Run complete E2E test suite in 3 phases.
+
+ Phase 1: Infrastructure tests (pytest, fast, no containers)
+ Phase 2: Automated benchmark E2E tests (pexpect, creates completed sessions)
+ Phase 3: Session verification tests (pytest, validates completed sessions)
+
+ This is fully automated - no user interaction required.
+ """
+ console.print(Panel("[bold]Full E2E Test Suite - 3 Phases[/bold]", style="cyan"))
+ console.print("""
+ [dim]Phase 1:[/dim] Infrastructure tests (pytest)
+ [dim]Phase 2:[/dim] Automated benchmark E2E tests (pexpect)
+ [dim]Phase 3:[/dim] Session verification tests (pytest)
+ """)
+
+ if not yes and not click.confirm("Run full automated E2E suite?"):
  raise click.ClickException("Aborted")

- # Setup
- ctx = click.get_current_context()
- ctx.invoke(setup, admin_password=admin_password, yes=yes)
+ results = {"phase1": None, "phase2": {}, "phase3": None}
+ tests_dir = Path(__file__).parent
+
+ # Setup (unless skipped)
+ if not skip_setup:
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]SETUP: Creating test user and assignments[/bold cyan]")
+ console.print("=" * 60)
+ ctx = click.get_current_context()
+ ctx.invoke(setup, admin_password=admin_password, yes=True)
+
+ # Phase 1: Infrastructure tests
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]PHASE 1: Infrastructure Tests[/bold cyan]")
+ console.print("=" * 60)
+ console.print("[dim]Running pytest on infrastructure, imports, benchmark flows...[/dim]\n")
+
+ phase1_result = subprocess.run(
+ [
+ "uv", "run", "pytest",
+ str(tests_dir / "test_infrastructure.py"),
+ str(tests_dir / "test_runtime_imports.py"),
+ str(tests_dir / "test_benchmark_flows.py"),
+ "-v", "--tb=short",
+ ],
+ cwd=tests_dir.parent.parent,
+ )
+ results["phase1"] = phase1_result.returncode == 0

- # Run each benchmark
- for benchmark in BENCHMARK_TASKS.keys():
- console.print(f"\n{'=' * 50}")
- console.print(f"[bold]Benchmark: {benchmark}[/bold]")
+ if not results["phase1"]:
+ console.print("\n[red bold]Phase 1 FAILED - stopping[/red bold]")
+ _print_full_summary(results)
+ raise SystemExit(1)
+
+ console.print("\n[green]Phase 1 PASSED[/green]")

- for i in range(2):
- if click.confirm(f"\nRun task {i+1}/2 for {benchmark}?"):
- ctx.invoke(run, benchmark=benchmark, task_index=i)
+ # Phase 2: Automated benchmark E2E tests
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]PHASE 2: Automated Benchmark E2E Tests[/bold cyan]")
+ console.print("=" * 60)
+ console.print("[dim]Running automated tests for each benchmark via pexpect...[/dim]\n")
+
+ from automated_runner import run_benchmark_test
+
+ for benchmark in BENCHMARK_TASKS.keys():
+ console.print(f"\n[bold]--- {benchmark} ---[/bold]")
+ try:
+ # Run task index 2 (third task, reserved for automated E2E)
+ success = run_benchmark_test(benchmark, task_index=2)
+ results["phase2"][benchmark] = success
+ if success:
+ console.print(f"[green]{benchmark}: PASSED[/green]")
+ else:
+ console.print(f"[red]{benchmark}: FAILED[/red]")
+ except Exception as e:
+ console.print(f"[red]{benchmark}: ERROR - {e}[/red]")
+ results["phase2"][benchmark] = False
+
+ phase2_passed = all(results["phase2"].values())
+ if not phase2_passed:
+ console.print("\n[yellow]Phase 2 had failures - continuing to Phase 3[/yellow]")
+
+ # Phase 3: Session verification tests
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]PHASE 3: Session Verification Tests[/bold cyan]")
+ console.print("=" * 60)
+ console.print("[dim]Running pytest on session lifecycle and eval logs...[/dim]\n")
+
+ phase3_result = subprocess.run(
+ [
+ "uv", "run", "pytest",
+ str(tests_dir / "test_session_lifecycle.py"),
+ str(tests_dir / "test_eval_logs.py"),
+ "-v", "--tb=short",
+ ],
+ cwd=tests_dir.parent.parent,
+ )
+ results["phase3"] = phase3_result.returncode == 0

- # Verify
- console.print(f"\n{'=' * 50}")
- ctx.invoke(verify, admin_password=admin_password)
+ # Summary
+ _print_full_summary(results)

  # Cleanup
- console.print(f"\n{'=' * 50}")
- if click.confirm("Run cleanup?"):
+ if cleanup_after:
+ console.print("\n" + "=" * 60)
+ console.print("[bold cyan]CLEANUP[/bold cyan]")
+ ctx = click.get_current_context()
  ctx.invoke(cleanup)

- console.print("\n[bold green]Full E2E test complete![/bold green]")
+ # Exit with appropriate code
+ all_passed = results["phase1"] and phase2_passed and results["phase3"]
+ if all_passed:
+ console.print("\n[bold green]All phases PASSED![/bold green]")
+ else:
+ console.print("\n[bold red]Some phases FAILED[/bold red]")
+ raise SystemExit(1)
+
+
+ def _print_full_summary(results: dict):
+ """Print summary table of all phases."""
+ console.print("\n" + "=" * 60)
+ console.print("[bold]SUMMARY[/bold]")
+ console.print("=" * 60)
+
+ table = Table()
+ table.add_column("Phase", style="cyan")
+ table.add_column("Status")
+ table.add_column("Details")
+
+ # Phase 1
+ if results["phase1"] is not None:
+ status = "[green]PASSED[/green]" if results["phase1"] else "[red]FAILED[/red]"
+ table.add_row("Phase 1: Infrastructure", status, "pytest infra/imports/flows")
+
+ # Phase 2
+ if results["phase2"]:
+ passed = sum(1 for v in results["phase2"].values() if v)
+ total = len(results["phase2"])
+ status = "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+ details = ", ".join(
+ f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
+ for b, v in results["phase2"].items()
+ )
+ table.add_row("Phase 2: Benchmarks", status, details)
+
+ # Phase 3
+ if results["phase3"] is not None:
+ status = "[green]PASSED[/green]" if results["phase3"] else "[red]FAILED[/red]"
+ table.add_row("Phase 3: Verification", status, "pytest lifecycle/logs")
+
+ console.print(table)


  if __name__ == "__main__":
tests/e2e/test_benchmark_flows.py
@@ -13,7 +13,14 @@ Run with: uv run pytest tests/e2e/test_benchmark_flows.py -v
  import pytest
  import requests

- from tests.e2e.conftest import BASE_URL, get_test_user_id, ssh_command, ssh_query
+ from tests.e2e.conftest import (
+ BASE_URL,
+ EXPECTED_ASSIGNMENT_COUNT,
+ EXPECTED_TASKS,
+ get_test_user_id,
+ ssh_command,
+ ssh_query,
+ )

  # Benchmark test configurations
  # First 2 tasks for pytest API tests, last 2 for interactive tests
@@ -367,12 +374,14 @@ class TestCrossBenchmark:
  assert int(count) > 0, f"No assignments for {benchmark}"

  def test_total_assignments_correct(self):
- """Total assignments should be 24 (4 per benchmark)."""
+ """Total assignments should match expected count (4 per benchmark)."""
  count = ssh_query(f"""
  SELECT COUNT(*) FROM assignments
  WHERE user_id = '{get_test_user_id()}'
  """)
- assert int(count) == 24
+ assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+ f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+ )


  # =============================================================================
tests/e2e/test_eval_logs.py
@@ -28,6 +28,15 @@ LOCAL_EVAL_LOGS_DIR = Path.home() / "Library" / "Application Support" / "hte-cli
  VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"


+ def db_path_to_host_path(db_path: str) -> str:
+ """Translate container path stored in DB to host path on VPS.
+
+ Backend runs in Docker with /opt/hte-web/data mounted as /data,
+ so paths are stored as /data/... but host has /opt/hte-web/data/...
+ """
+ return db_path.replace("/data/", "/opt/hte-web/data/")
+
+
  def ssh_query(query: str) -> str:
  """Run a sqlite3 query on the VPS."""
  result = subprocess.run(
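As a concrete example of the translation (the eval-log filename below is made up for illustration):

from tests.e2e.test_eval_logs import db_path_to_host_path

# The backend container stores paths under /data; on the VPS host the same
# volume lives under /opt/hte-web/data.
db_path = "/data/eval_logs/session-1234.eval.gz"  # hypothetical filename
assert db_path_to_host_path(db_path) == "/opt/hte-web/data/eval_logs/session-1234.eval.gz"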
@@ -85,8 +94,8 @@ class TestLocalEvalLogs:
  pytest.skip("Local eval logs directory not found")

  logs = list(LOCAL_EVAL_LOGS_DIR.glob("*.eval"))
- # Just verify we can list them
- assert isinstance(logs, list)
+ # Verify we found eval logs (if E2E tests have run, there should be some)
+ assert len(logs) > 0, f"No eval logs found in {LOCAL_EVAL_LOGS_DIR}"


  # =============================================================================
@@ -103,11 +112,12 @@ class TestVPSEvalLogs:
  assert result == "exists", "VPS eval logs directory not found"

  def test_vps_eval_log_count(self):
- """Should be able to count eval logs on VPS."""
+ """Should have eval logs on VPS if sessions have completed."""
  result = ssh_command(f"find {VPS_EVAL_LOGS_DIR} -name '*.eval.gz' 2>/dev/null | wc -l")
- count = int(result.strip()) if result.strip().isdigit() else 0
- # Just verify we can count them
- assert count >= 0
+ assert result.strip().isdigit(), f"Invalid count result: {result}"
+ count = int(result.strip())
+ # If E2E tests have run, there should be eval logs
+ assert count > 0, f"No eval logs found on VPS in {VPS_EVAL_LOGS_DIR}"

  def test_completed_sessions_have_eval_log_path(self):
  """Completed sessions should have eval_log_path recorded."""
@@ -128,9 +138,14 @@ class TestVPSEvalLogs:
  """)

  # All completed sessions should have eval log paths
- assert int(with_path) == int(
- count
- ), f"Only {with_path}/{count} completed sessions have eval_log_path"
+ # Handle empty string from SQL query
+ with_path_count = int(with_path) if with_path else 0
+ total_count = int(count) if count else 0
+
+ if total_count == 0:
+ pytest.skip("No completed sessions to check")
+
+ assert with_path_count == total_count, f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"

  def test_eval_log_files_exist_on_vps(self):
  """Eval log files referenced in DB should exist on VPS."""
@@ -147,8 +162,9 @@ class TestVPSEvalLogs:

  for path in paths.split("\n"):
  if path:
- exists = ssh_command(f"test -f {path} && echo exists")
- assert exists == "exists", f"Eval log not found: {path}"
+ host_path = db_path_to_host_path(path)
+ exists = ssh_command(f"test -f {host_path} && echo exists")
+ assert exists == "exists", f"Eval log not found: {host_path} (DB path: {path})"


  # =============================================================================
@@ -175,32 +191,34 @@ class TestEvalLogFormat:

  def test_eval_log_can_be_decompressed(self):
  """Eval logs should be valid gzip files."""
- path = ssh_query("""
+ db_path = ssh_query("""
  SELECT eval_log_path FROM sessions
  WHERE status = 'submitted'
  AND eval_log_path IS NOT NULL
  LIMIT 1
  """)

- if not path:
+ if not db_path:
  pytest.skip("No eval logs to test")

+ path = db_path_to_host_path(db_path)
  # Try to decompress
  result = ssh_command(f"gunzip -t {path} 2>&1 && echo ok")
  assert "ok" in result, f"Eval log not valid gzip: {result}"

  def test_eval_log_contains_expected_structure(self):
  """Eval logs should contain expected Inspect AI structure."""
- path = ssh_query("""
+ db_path = ssh_query("""
  SELECT eval_log_path FROM sessions
  WHERE status = 'submitted'
  AND eval_log_path IS NOT NULL
  LIMIT 1
  """)

- if not path:
+ if not db_path:
  pytest.skip("No eval logs to test")

+ path = db_path_to_host_path(db_path)
  # List contents of the gzipped eval (it's actually a zip inside gzip)
  # First copy to temp, decompress, check structure
  result = ssh_command(f"""
@@ -225,40 +243,43 @@ class TestEvalLogUpload:
  """Test eval log upload functionality."""

  def test_upload_event_recorded(self):
- """Upload events should be recorded in session_events."""
+ """Upload events should be recorded in session_events for sessions with eval logs."""
+ # Only check sessions that have eval_log_path (proves upload succeeded)
  session_id = ssh_query(f"""
  SELECT id FROM sessions
  WHERE user_id = '{get_test_user_id()}'
  AND status = 'submitted'
+ AND eval_log_path IS NOT NULL
  LIMIT 1
  """)

  if not session_id:
- pytest.skip("No completed sessions")
+ pytest.skip("No completed sessions with eval logs")

  events = ssh_query(f"""
  SELECT event_type FROM session_events
  WHERE session_id = '{session_id}'
  """)

- # Should have upload-related events
+ # Should have upload-related events for sessions with eval logs
  event_list = events.split("\n") if events else []
  has_upload = any("upload" in e.lower() for e in event_list)
- # Note: upload events might not always exist
- assert isinstance(has_upload, bool)
+
+ assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"

  def test_eval_log_size_reasonable(self):
  """Eval logs should be reasonably sized (not empty, not huge)."""
- path = ssh_query("""
+ db_path = ssh_query("""
  SELECT eval_log_path FROM sessions
  WHERE status = 'submitted'
  AND eval_log_path IS NOT NULL
  LIMIT 1
  """)

- if not path:
+ if not db_path:
  pytest.skip("No eval logs to test")

+ path = db_path_to_host_path(db_path)
  size = ssh_command(f"stat -c%s {path} 2>/dev/null || stat -f%z {path}")

  if size.isdigit():
tests/e2e/test_infrastructure.py
@@ -25,31 +25,8 @@ from tests.e2e.conftest import (
  ssh_query,
  )

- # Expected task assignments (4 per benchmark)
- # First 2 for pytest API tests, last 2 for interactive tests
- EXPECTED_TASKS = {
- "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
- "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
- "intercode-ctf": [
- "intercode-ctf_general_skills/task_4",
- "intercode-ctf_reverse/task_1",
- "intercode-ctf_reverse/task_3",
- "intercode-ctf_reverse/task_85",
- ],
- "nyuctf": [
- "2013f-cry-stfu",
- "2016q-rev-rock",
- "2023q-rev-baby_s_first",
- "2019q-cry-super_curve",
- ],
- "cybashbench": [
- "cybashbench_pwn/nl2bash_003",
- "cybashbench_reverse/nl2bash_004",
- "cybashbench_web/nl2bash_001",
- "cybashbench_forensics/nl2bash_005",
- ],
- "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
- }
+ # Import shared constants from conftest
+ from tests.e2e.conftest import EXPECTED_TASKS, EXPECTED_ASSIGNMENT_COUNT


  # =============================================================================
@@ -133,11 +110,13 @@ class TestAssignments:
  """Test that task assignments are correctly set up."""

  def test_correct_number_of_assignments(self):
- """Test user should have exactly 12 assignments."""
+ """Test user should have expected number of assignments."""
  count = ssh_query(
  f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
  )
- assert int(count) == 24
+ assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+ f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+ )

  @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
  def test_benchmark_tasks_assigned(self, benchmark, tasks):
@@ -226,8 +205,9 @@ class TestAPIEndpoints:
  )
  assert response.status_code == 200
  assignments = response.json()
- # User may have different number of assignments
- assert isinstance(assignments, list)
+ # Test user should have assignments from E2E setup
+ assert isinstance(assignments, list), "Expected list of assignments"
+ assert len(assignments) > 0, "Test user should have at least one assignment"

  def test_assignment_has_task_info(self, api_headers):
  """Assignments should include task information."""
tests/e2e/test_runtime_imports.py
@@ -149,8 +149,10 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')
  pytest.fail(f"Import failed in container: {result.stderr}")

  assert "Loaded" in result.stdout
- # Should have at least 6 benchmarks
- assert "6 benchmarks" in result.stdout or "7 benchmarks" in result.stdout
+ # Should have exactly 7 benchmarks
+ assert "7 benchmarks" in result.stdout, (
+ f"Expected 7 benchmarks, got: {result.stdout}"
+ )

  def test_backend_can_import_adapters(self):
  """Backend should be able to instantiate adapters."""
@@ -176,9 +178,11 @@ for name, cls in HUMAN_REGISTRY.items():
  if "FAIL" in result.stdout:
  pytest.fail(f"Adapter instantiation failed: {result.stdout}")

- # Should have OK for all benchmarks
+ # All benchmarks should show OK - STRICT check
  for benchmark in BENCHMARKS:
- assert f"{benchmark}: OK" in result.stdout or benchmark not in result.stdout
+ assert f"{benchmark}: OK" in result.stdout, (
+ f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
+ )


  class TestLocalImports:
tests/e2e/test_session_lifecycle.py
@@ -164,17 +164,26 @@ class TestSessionCompletion:

  def test_completed_session_has_score(self):
  """Completed sessions should have a score."""
+ # Count total submitted sessions
+ total_submitted = ssh_query(f"""
+ SELECT COUNT(*) FROM sessions
+ WHERE user_id = '{get_test_user_id()}'
+ AND status = 'submitted'
+ """)
+ total = int(total_submitted) if total_submitted else 0
+ if total == 0:
+ pytest.skip("No submitted sessions to verify")
+
+ # Count sessions without score
  sessions_without_score = ssh_query(f"""
  SELECT COUNT(*) FROM sessions
  WHERE user_id = '{get_test_user_id()}'
  AND status = 'submitted'
  AND score IS NULL
  """)
- # Note: score can legitimately be NULL for some benchmarks
- # This test documents expected behavior
- count = int(sessions_without_score)
- # We just want to verify we can query this
- assert count >= 0
+ count = int(sessions_without_score) if sessions_without_score else 0
+ # Most submitted sessions should have scores (some benchmarks may not score)
+ assert count < total, f"All {total} sessions missing scores"

  def test_completed_session_has_answer(self):
  """Completed sessions should have an answer."""
@@ -208,14 +217,16 @@ class TestSessionState:
  """Test session state verification (read-only)."""

  def test_abandoned_sessions_count(self):
- """Verify we can count abandoned sessions."""
+ """Verify abandoned sessions exist and are queryable."""
  abandoned_count = ssh_query(f"""
  SELECT COUNT(*) FROM sessions
  WHERE user_id = '{get_test_user_id()}'
  AND status = 'abandoned'
  """)
- # Just verify we can query abandoned sessions
- assert int(abandoned_count) >= 0
+ count = int(abandoned_count) if abandoned_count else 0
+ # Verify the query returned a valid number (not empty/error)
+ assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
+ # Note: count can legitimately be 0 if no sessions were abandoned

  def test_no_stuck_sessions_older_than_24h(self):
  """No in_progress sessions should be older than 24 hours."""
@@ -387,8 +398,9 @@ class TestSessionCancellation:
  WHERE user_id = '{get_test_user_id()}'
  AND status = 'cancelled'
  """)
- # Just verify we can query cancelled sessions
- assert int(cancelled) >= 0
+ # Verify query returned valid result
+ assert cancelled.strip().isdigit(), f"Query returned invalid value: {cancelled}"
+ # Note: count can legitimately be 0 if no sessions were cancelled

  def test_no_orphaned_in_progress_after_cancel(self):
  """Assignments should not be in_progress if session is cancelled."""
uv.lock
@@ -625,7 +625,7 @@ wheels = [

  [[package]]
  name = "hte-cli"
- version = "0.2.18"
+ version = "0.2.21"
  source = { editable = "." }
  dependencies = [
  { name = "click" },