hte-cli 0.2.19__tar.gz → 0.2.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hte_cli-0.2.19 → hte_cli-0.2.22}/PKG-INFO +1 -1
- {hte_cli-0.2.19 → hte_cli-0.2.22}/pyproject.toml +1 -1
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/cli.py +21 -1
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/automated_runner.py +180 -79
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/conftest.py +35 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/e2e_test.py +154 -25
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_benchmark_flows.py +12 -3
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_eval_logs.py +43 -22
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_infrastructure.py +9 -29
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_runtime_imports.py +8 -4
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/test_session_lifecycle.py +22 -10
- {hte_cli-0.2.19 → hte_cli-0.2.22}/uv.lock +1 -1
- {hte_cli-0.2.19 → hte_cli-0.2.22}/.gitignore +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/README.md +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/__init__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/__main__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/api_client.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/config.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/errors.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/events.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/image_utils.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/runner.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/scorers.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/src/hte_cli/version_check.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/__init__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/__init__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/e2e/verify_docker_deps.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/__init__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/conftest.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_image_utils.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_runner.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.22}/tests/unit/test_scorers.py +0 -0
|
@@ -280,6 +280,14 @@ def session_join(ctx, session_id: str, force_setup: bool):
|
|
|
280
280
|
},
|
|
281
281
|
}
|
|
282
282
|
|
|
283
|
+
# Send session_started event (records CLI version for debugging)
|
|
284
|
+
events.session_started(
|
|
285
|
+
{
|
|
286
|
+
"cli_version": __version__,
|
|
287
|
+
"task_id": session_info["task_id"],
|
|
288
|
+
}
|
|
289
|
+
)
|
|
290
|
+
|
|
283
291
|
# Step 3: Run setup (skip if reconnecting without force)
|
|
284
292
|
setup_start_time = time.monotonic()
|
|
285
293
|
images = []
|
|
@@ -429,13 +437,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
|
|
|
429
437
|
console.print(f"Answer: {result.answer}")
|
|
430
438
|
console.print(f"Time: {result.time_seconds:.1f}s")
|
|
431
439
|
|
|
440
|
+
# Track upload size and timing
|
|
441
|
+
upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
|
|
442
|
+
upload_size_kb = upload_size_bytes / 1024
|
|
443
|
+
|
|
444
|
+
events.upload_started(size_bytes=upload_size_bytes)
|
|
445
|
+
upload_start_time = time.monotonic()
|
|
446
|
+
|
|
432
447
|
# Upload to server
|
|
433
448
|
with Progress(
|
|
434
449
|
SpinnerColumn(),
|
|
435
450
|
TextColumn("[progress.description]{task.description}"),
|
|
436
451
|
console=console,
|
|
437
452
|
) as progress:
|
|
438
|
-
|
|
453
|
+
size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
|
|
454
|
+
progress.add_task(f"Uploading result{size_str}...", total=None)
|
|
439
455
|
try:
|
|
440
456
|
upload_result = api.upload_result(
|
|
441
457
|
session_id=session_id,
|
|
@@ -450,6 +466,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
|
|
|
450
466
|
console.print(f"[red]Failed to upload result: {e}[/red]")
|
|
451
467
|
sys.exit(1)
|
|
452
468
|
|
|
469
|
+
# Record upload completion
|
|
470
|
+
upload_duration = time.monotonic() - upload_start_time
|
|
471
|
+
events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
|
|
472
|
+
|
|
453
473
|
if upload_result.get("score") is not None:
|
|
454
474
|
console.print(f"Score: {upload_result['score']}")
|
|
455
475
|
|
|
@@ -52,11 +52,15 @@ def strip_ansi(text: str) -> str:
|
|
|
52
52
|
|
|
53
53
|
VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
|
|
54
54
|
|
|
55
|
+
# Use the pipx-installed CLI (what experts use), not the local venv version
|
|
56
|
+
# This ensures E2E tests validate the actual published PyPI package
|
|
57
|
+
HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
|
|
58
|
+
|
|
55
59
|
# Environment verification commands per benchmark
|
|
56
60
|
# Based on docs/benchmark-environments.md
|
|
57
61
|
# Format: (command, expected_pattern, description)
|
|
58
|
-
#
|
|
59
|
-
# These tests
|
|
62
|
+
# STRICT: All tests must pass - no optional/informational checks
|
|
63
|
+
# These tests verify the exact environment specified in docs
|
|
60
64
|
ENVIRONMENT_CHECKS = {
|
|
61
65
|
"intercode-ctf": [
|
|
62
66
|
# Working directory: /ctf/{task_num} (e.g., /ctf/3, /ctf/85)
|
|
@@ -69,8 +73,8 @@ ENVIRONMENT_CHECKS = {
|
|
|
69
73
|
("which netcat", "/usr/bin/netcat", "netcat installed"),
|
|
70
74
|
("which tcpdump", "/usr/bin/tcpdump", "tcpdump installed"),
|
|
71
75
|
("which curl", "/usr/bin/curl", "curl installed"),
|
|
72
|
-
# Forensics tools
|
|
73
|
-
("which mmls",
|
|
76
|
+
# Forensics tools - STRICT: must exist
|
|
77
|
+
("which mmls", "/usr/bin/mmls", "mmls (sleuthkit) at /usr/bin/mmls"),
|
|
74
78
|
# Python packages - must import successfully
|
|
75
79
|
("python3 -c 'from pwn import *; print(\"pwntools\")'", "pwntools", "pwntools importable"),
|
|
76
80
|
("python3 -c 'from scapy.all import *; print(\"scapy\")'", "scapy", "scapy importable"),
|
|
@@ -90,16 +94,17 @@ ENVIRONMENT_CHECKS = {
|
|
|
90
94
|
("pwd", "/home/ctfplayer", "Working directory is /home/ctfplayer"),
|
|
91
95
|
# User: ctfplayer (NOT root)
|
|
92
96
|
("whoami", "ctfplayer", "Running as ctfplayer user"),
|
|
93
|
-
|
|
97
|
+
# User ID must be 1000 (standard non-root)
|
|
98
|
+
("id -u", "1000", "User ID is 1000"),
|
|
94
99
|
# Passwordless sudo must work
|
|
95
100
|
("sudo -n whoami", "root", "Passwordless sudo to root"),
|
|
96
101
|
# OS: Ubuntu 22.04 specifically
|
|
97
102
|
("cat /etc/os-release | grep VERSION_ID", "22.04", "Ubuntu 22.04 base"),
|
|
98
|
-
# Core tools at expected paths
|
|
103
|
+
# Core tools at expected paths - STRICT
|
|
99
104
|
("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
|
|
100
|
-
("which radare2", "/usr/bin/radare2", "radare2
|
|
101
|
-
("which sqlmap", "/usr/bin/sqlmap", "sqlmap
|
|
102
|
-
("which jadx",
|
|
105
|
+
("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
|
|
106
|
+
("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
|
|
107
|
+
("which jadx", "/usr/local/bin/jadx", "jadx at /usr/local/bin/jadx"),
|
|
103
108
|
# Python venv must exist at expected location
|
|
104
109
|
(
|
|
105
110
|
"test -f /home/ctfplayer/.ctfenv/bin/python3 && echo 'venv'",
|
|
@@ -112,72 +117,72 @@ ENVIRONMENT_CHECKS = {
|
|
|
112
117
|
("python3 -c 'import z3; print(\"z3\")'", "z3", "z3-solver importable"),
|
|
113
118
|
# 32-bit support (for CTF binaries)
|
|
114
119
|
("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
|
|
120
|
+
# Task files location - STRICT
|
|
121
|
+
("test -d /home/ctfplayer/ctf_files && echo 'EXISTS'", "EXISTS", "ctf_files directory exists"),
|
|
115
122
|
],
|
|
116
123
|
"cybench": [
|
|
117
|
-
# Working directory: /
|
|
118
|
-
("pwd", "/
|
|
124
|
+
# Working directory: /app (official cybench/cybench:latest image)
|
|
125
|
+
("pwd", "/app", "Working directory is /app"),
|
|
119
126
|
# OS: Kali Linux (must contain 'Kali' in name)
|
|
120
127
|
("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
|
|
121
|
-
#
|
|
122
|
-
#
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
("which
|
|
127
|
-
|
|
128
|
-
("which
|
|
129
|
-
("which
|
|
130
|
-
#
|
|
131
|
-
("which nmap", "/usr/bin/nmap", "nmap installed"),
|
|
132
|
-
("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
|
|
133
|
-
("which msfconsole || echo 'NOT_FOUND'", None, "[IMAGE] msfconsole"),
|
|
134
|
-
("which hydra || echo 'NOT_FOUND'", None, "[IMAGE] hydra"),
|
|
135
|
-
("which john || echo 'NOT_FOUND'", None, "[IMAGE] john"),
|
|
136
|
-
# Docker-in-Docker capability (required for web challenges)
|
|
128
|
+
# Required tools - STRICT: must exist at expected paths
|
|
129
|
+
# Using cybench/cybench:latest which has all these tools
|
|
130
|
+
("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
|
|
131
|
+
("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
|
|
132
|
+
("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
|
|
133
|
+
("which binwalk", "/usr/bin/binwalk", "binwalk at /usr/bin/binwalk"),
|
|
134
|
+
# Security tools - STRICT
|
|
135
|
+
("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
|
|
136
|
+
("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
|
|
137
|
+
# Docker available (daemon may not be running in all test contexts)
|
|
137
138
|
("docker --version", "Docker version", "Docker available"),
|
|
138
|
-
(
|
|
139
|
-
#
|
|
140
|
-
|
|
141
|
-
|
|
139
|
+
# NOTE: Docker daemon check (docker ps) removed - DinD requires privileged
|
|
140
|
+
# mode which isn't always available. Web challenges needing DinD will
|
|
141
|
+
# fail at runtime if daemon isn't accessible.
|
|
142
|
+
# Python via venv - STRICT
|
|
143
|
+
("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
|
|
144
|
+
(
|
|
145
|
+
"/venv/bin/python3 -c 'from pwn import *; print(\"pwntools\")'",
|
|
146
|
+
"pwntools",
|
|
147
|
+
"pwntools in /venv",
|
|
148
|
+
),
|
|
142
149
|
# apt-get for additional tools
|
|
143
150
|
("apt-get --version", "apt", "apt-get available"),
|
|
144
151
|
# Task files visible - cybench tasks have nested directories (blog/, nginx/, etc.)
|
|
145
|
-
# Check that files exist in /root (not just empty directory)
|
|
146
|
-
("ls -
|
|
152
|
+
# Check that files exist in /root (not just empty directory) using test -gt 0
|
|
153
|
+
("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
|
|
147
154
|
# For tasks with subdirs (chunky, etc.), verify structure is mounted
|
|
148
|
-
("find /root -type f 2>/dev/null | head -5", None, "Files under /root (recursive)"),
|
|
155
|
+
("test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0", None, "Files under /root (recursive)"),
|
|
149
156
|
],
|
|
150
157
|
"cybergym": [
|
|
151
158
|
# Working directory: /root
|
|
152
159
|
("pwd", "/root", "Working directory is /root"),
|
|
153
160
|
# OS: Kali Linux (same base as cybench)
|
|
154
161
|
("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
|
|
155
|
-
# CyberGym-specific: submit.sh
|
|
156
|
-
("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present
|
|
157
|
-
("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
|
|
158
|
-
#
|
|
159
|
-
("which gdb
|
|
160
|
-
("which ghidra
|
|
161
|
-
("which radare2
|
|
162
|
-
#
|
|
163
|
-
("
|
|
164
|
-
("which nmap", "/usr/bin/nmap", "nmap
|
|
165
|
-
("which curl", "/usr/bin/curl", "curl
|
|
162
|
+
# CyberGym-specific: submit.sh in home dir for PoC submission
|
|
163
|
+
("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
|
|
164
|
+
("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
|
|
165
|
+
# Required tools - STRICT (same image as cybench)
|
|
166
|
+
("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
|
|
167
|
+
("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
|
|
168
|
+
("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
|
|
169
|
+
# Python via venv (same as cybench/cybench:latest)
|
|
170
|
+
("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
|
|
171
|
+
("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
|
|
172
|
+
("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
|
|
166
173
|
# apt-get for additional tools
|
|
167
174
|
("apt-get --version", "apt", "apt-get available"),
|
|
168
|
-
# Task files from setup.sh
|
|
169
|
-
("ls /root/ 2>/dev/null | wc -l", None, "Files in /root"),
|
|
170
175
|
],
|
|
171
176
|
"cvebench": [
|
|
172
177
|
# OS: Kali Linux
|
|
173
178
|
("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
|
|
174
|
-
# Core security tools for CVE exploitation
|
|
175
|
-
("which python3",
|
|
176
|
-
("which nmap", "/usr/bin/nmap", "nmap
|
|
177
|
-
("which curl", "/usr/bin/curl", "curl
|
|
178
|
-
("which wget", "/usr/bin/wget", "wget
|
|
179
|
-
# Network access to target
|
|
180
|
-
("which nc",
|
|
179
|
+
# Core security tools for CVE exploitation - STRICT
|
|
180
|
+
("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
|
|
181
|
+
("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
|
|
182
|
+
("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
|
|
183
|
+
("which wget", "/usr/bin/wget", "wget at /usr/bin/wget"),
|
|
184
|
+
# Network access to target - STRICT
|
|
185
|
+
("which nc", "/usr/bin/nc", "netcat at /usr/bin/nc"),
|
|
181
186
|
# apt-get for additional tools
|
|
182
187
|
("apt-get --version", "apt", "apt-get available"),
|
|
183
188
|
],
|
|
@@ -191,23 +196,60 @@ ENVIRONMENT_CHECKS = {
|
|
|
191
196
|
],
|
|
192
197
|
}
|
|
193
198
|
|
|
199
|
+
# Task-specific file expectations.
# Maps benchmark -> task_id -> list of paths expected to exist in the container.
# Use "*" as task_id for checks that apply to all tasks in a benchmark.
# An explicit empty list means "this task has no local files to check" and
# takes precedence over the "*" wildcard entry.
TASK_FILE_CHECKS = {
    "cybench": {
        "chunky": ["/root/blog", "/root/nginx", "/root/rcache"],
        "avatar": ["/root/chall.py"],
        "lootstash": ["/root/stash"],
        "primary_knowledge": [],  # No local files - network-based
    },
    "cybergym": {
        "*": ["/root/submit.sh"],  # All cybergym tasks need submit.sh in home
    },
    "nyuctf": {
        "*": ["/home/ctfplayer/ctf_files"],  # Standard location
    },
    "intercode-ctf": {
        "*": ["/ctf"],  # Task files in /ctf directory
    },
}


def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
    """Build file-existence checks for one task of a benchmark.

    Looks up ``TASK_FILE_CHECKS[benchmark]`` and picks the entry for
    ``task_id`` when one is present (even if it is an empty list); otherwise
    falls back to the benchmark's ``"*"`` wildcard entry, and to no checks at
    all when neither exists.

    Args:
        benchmark: Benchmark name used as the top-level key of
            ``TASK_FILE_CHECKS``.
        task_id: Task identifier used as the second-level key.

    Returns:
        A list of ``(command, expected_pattern, description)`` tuples, one per
        expected path. Each command prints ``EXISTS`` when the path exists.
    """
    # Local import keeps this block self-contained; the file's import section
    # is not visible from here.
    import shlex

    task_files = TASK_FILE_CHECKS.get(benchmark, {})

    # Membership test (not truthiness): a task that explicitly lists [] must
    # not silently inherit the wildcard's paths.
    if task_id in task_files:
        expected = task_files[task_id]
    else:
        expected = task_files.get("*", [])

    checks = []
    for path in expected:
        # Quote the path so spaces or shell metacharacters cannot change the
        # meaning of the generated command. Plain paths are left unchanged.
        quoted = shlex.quote(path)
        checks.append(
            (f"test -e {quoted} && echo 'EXISTS'", "EXISTS", f"{path} exists")
        )

    return checks
|
|
235
|
+
|
|
194
236
|
# Commands to run for submission tests
|
|
195
237
|
SUBMISSION_TESTS = {
|
|
196
238
|
"intercode-ctf": {
|
|
197
239
|
"score_cmd": 'task score "test_flag_123"',
|
|
198
|
-
"score_expect": "
|
|
240
|
+
"score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
|
|
199
241
|
"submit_cmd": 'task submit "test_flag_for_e2e"',
|
|
200
242
|
"submit_expect": "submitted",
|
|
201
243
|
},
|
|
202
244
|
"nyuctf": {
|
|
203
245
|
"score_cmd": 'task score "test_flag_123"',
|
|
204
|
-
"score_expect": "
|
|
246
|
+
"score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
|
|
205
247
|
"submit_cmd": 'task submit "test_flag_for_e2e"',
|
|
206
248
|
"submit_expect": "submitted",
|
|
207
249
|
},
|
|
208
250
|
"cybench": {
|
|
209
251
|
"score_cmd": 'task score "test_flag_123"',
|
|
210
|
-
"score_expect": "
|
|
252
|
+
"score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
|
|
211
253
|
"submit_cmd": 'task submit "test_flag_for_e2e"',
|
|
212
254
|
"submit_expect": "submitted",
|
|
213
255
|
},
|
|
@@ -228,6 +270,12 @@ SUBMISSION_TESTS = {
|
|
|
228
270
|
"submit_cmd": 'task submit "ls -la"',
|
|
229
271
|
"submit_expect": "submitted",
|
|
230
272
|
},
|
|
273
|
+
"nl2bash": {
|
|
274
|
+
# Single-shot - no score command (same as cybashbench)
|
|
275
|
+
"score_cmd": None,
|
|
276
|
+
"submit_cmd": 'task submit "ls -la"',
|
|
277
|
+
"submit_expect": "submitted",
|
|
278
|
+
},
|
|
231
279
|
}
|
|
232
280
|
|
|
233
281
|
|
|
@@ -336,9 +384,16 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
336
384
|
|
|
337
385
|
# Start the CLI using the new session join flow
|
|
338
386
|
# Session has status="created", so CLI will run full setup
|
|
339
|
-
|
|
387
|
+
# Use explicit pipx path to test the published PyPI version, not local dev
|
|
388
|
+
if not HTE_CLI_PATH.exists():
|
|
389
|
+
console.print(f"[red]hte-cli not found at {HTE_CLI_PATH}[/red]")
|
|
390
|
+
console.print("[yellow]Install with: pipx install hte-cli[/yellow]")
|
|
391
|
+
results.append(TestResult("CLI installed", False, "", f"hte-cli not at {HTE_CLI_PATH}"))
|
|
392
|
+
return results
|
|
393
|
+
|
|
394
|
+
console.print(f"Launching {HTE_CLI_PATH} session join {session_id}...")
|
|
340
395
|
child = pexpect.spawn(
|
|
341
|
-
f"
|
|
396
|
+
f"{HTE_CLI_PATH} session join {session_id}",
|
|
342
397
|
encoding="utf-8",
|
|
343
398
|
timeout=timeout,
|
|
344
399
|
env={**os.environ, "TERM": "dumb"}, # Disable colors for easier parsing
|
|
@@ -388,6 +443,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
388
443
|
results.append(TestResult("Environment setup", True, "Environment ready"))
|
|
389
444
|
|
|
390
445
|
# Wait for the "Login to the system" message and docker exec command
|
|
446
|
+
# CVE bench builds containers from source, can take 5+ minutes
|
|
391
447
|
console.print("Waiting for docker exec command...")
|
|
392
448
|
idx = child.expect(
|
|
393
449
|
[
|
|
@@ -395,7 +451,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
395
451
|
r"docker exec -it",
|
|
396
452
|
pexpect.TIMEOUT,
|
|
397
453
|
],
|
|
398
|
-
timeout=
|
|
454
|
+
timeout=300, # 5 minutes for slow builds (cvebench)
|
|
399
455
|
)
|
|
400
456
|
|
|
401
457
|
if idx == 2: # TIMEOUT
|
|
@@ -503,7 +559,12 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
503
559
|
|
|
504
560
|
# Get benchmark-specific checks from ENVIRONMENT_CHECKS
|
|
505
561
|
env_checks = ENVIRONMENT_CHECKS.get(benchmark, [])
|
|
506
|
-
|
|
562
|
+
|
|
563
|
+
# Add task-specific file checks
|
|
564
|
+
file_checks = get_file_checks_for_task(benchmark, task_id)
|
|
565
|
+
all_checks = env_checks + file_checks
|
|
566
|
+
|
|
567
|
+
for check in all_checks:
|
|
507
568
|
# Unpack: (command, expected_pattern, description)
|
|
508
569
|
if len(check) == 3:
|
|
509
570
|
cmd, expected, desc = check
|
|
@@ -517,7 +578,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
517
578
|
["docker", "exec", container_name, "sh", "-c", cmd],
|
|
518
579
|
capture_output=True,
|
|
519
580
|
text=True,
|
|
520
|
-
timeout=
|
|
581
|
+
timeout=30, # Increased for slow imports (angr takes ~10s)
|
|
521
582
|
)
|
|
522
583
|
output = result.stdout.strip()
|
|
523
584
|
stderr = result.stderr.strip()
|
|
@@ -560,17 +621,29 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
560
621
|
|
|
561
622
|
# Test score command if available
|
|
562
623
|
if sub_tests.get("score_cmd"):
|
|
624
|
+
# Clear buffer before score test to avoid capturing stale output
|
|
625
|
+
try:
|
|
626
|
+
docker_child.read_nonblocking(size=10000, timeout=0.5)
|
|
627
|
+
except Exception:
|
|
628
|
+
pass
|
|
563
629
|
docker_child.sendline(sub_tests["score_cmd"])
|
|
564
630
|
time.sleep(2)
|
|
565
631
|
docker_child.expect(prompt_patterns[:-1], timeout=30)
|
|
566
632
|
output = strip_ansi(docker_child.before or "")
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
633
|
+
|
|
634
|
+
expected_score = sub_tests.get("score_expect")
|
|
635
|
+
if expected_score:
|
|
636
|
+
passed = expected_score.lower() in output.lower()
|
|
637
|
+
details = output[:200] if passed else f"Expected '{expected_score}' in output: {output[:100]}..."
|
|
638
|
+
results.append(TestResult("task score", passed, details))
|
|
639
|
+
else:
|
|
640
|
+
results.append(
|
|
641
|
+
TestResult(
|
|
642
|
+
"task score",
|
|
643
|
+
True, # Just checking it runs
|
|
644
|
+
output[:200],
|
|
645
|
+
)
|
|
572
646
|
)
|
|
573
|
-
)
|
|
574
647
|
|
|
575
648
|
# Submit answer
|
|
576
649
|
console.print("Submitting test answer...")
|
|
@@ -605,14 +678,15 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
605
678
|
timeout=60,
|
|
606
679
|
)
|
|
607
680
|
# EOF (idx=4) is expected - container exits after task submit
|
|
608
|
-
|
|
681
|
+
# TIMEOUT (idx=3) is a failure
|
|
682
|
+
if idx != 3:
|
|
609
683
|
results.append(
|
|
610
684
|
TestResult("Submission", True, "Answer submitted (container exited)")
|
|
611
685
|
)
|
|
612
686
|
else:
|
|
613
687
|
results.append(
|
|
614
688
|
TestResult(
|
|
615
|
-
"Submission", False, docker_child.before or "", "Submission
|
|
689
|
+
"Submission", False, docker_child.before or "", "Submission timed out waiting for result"
|
|
616
690
|
)
|
|
617
691
|
)
|
|
618
692
|
elif idx < 3:
|
|
@@ -624,7 +698,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
624
698
|
else:
|
|
625
699
|
results.append(
|
|
626
700
|
TestResult(
|
|
627
|
-
"Submission", False, docker_child.before or "", "Submission
|
|
701
|
+
"Submission", False, docker_child.before or "", "Submission timed out at prompt"
|
|
628
702
|
)
|
|
629
703
|
)
|
|
630
704
|
|
|
@@ -671,10 +745,19 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
|
|
|
671
745
|
return results
|
|
672
746
|
|
|
673
747
|
|
|
674
|
-
def verify_artifacts(task_id: str,
|
|
748
|
+
def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
|
|
675
749
|
"""Verify session and eval log artifacts were created."""
|
|
676
750
|
results = []
|
|
677
751
|
|
|
752
|
+
# Get expected answer from SUBMISSION_TESTS
|
|
753
|
+
sub_tests = SUBMISSION_TESTS.get(benchmark, {})
|
|
754
|
+
submit_cmd = sub_tests.get("submit_cmd", "")
|
|
755
|
+
expected_answer = None
|
|
756
|
+
if 'task submit "' in submit_cmd:
|
|
757
|
+
expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
|
|
758
|
+
elif "task submit '" in submit_cmd:
|
|
759
|
+
expected_answer = submit_cmd.split("task submit '")[1].rstrip("'")
|
|
760
|
+
|
|
678
761
|
# Check session in database
|
|
679
762
|
session_info = ssh_query(f"""
|
|
680
763
|
SELECT id, status, score, client_active_seconds, answer
|
|
@@ -699,14 +782,32 @@ def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
|
|
|
699
782
|
"Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
|
|
700
783
|
)
|
|
701
784
|
)
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
785
|
+
|
|
786
|
+
# Verify answer
|
|
787
|
+
if expected_answer and benchmark != "cybergym": # Cybergym submits file content, hard to verify here
|
|
788
|
+
results.append(
|
|
789
|
+
TestResult(
|
|
790
|
+
"Answer matches submission",
|
|
791
|
+
answer == expected_answer,
|
|
792
|
+
f"Expected: '{expected_answer}', Got: '{answer}'"
|
|
793
|
+
)
|
|
794
|
+
)
|
|
795
|
+
else:
|
|
796
|
+
results.append(
|
|
797
|
+
TestResult(
|
|
798
|
+
"Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
|
|
799
|
+
)
|
|
800
|
+
)
|
|
801
|
+
|
|
802
|
+
# Score check - some benchmarks compute scores server-side later (not immediately)
|
|
803
|
+
no_immediate_score = benchmark in ("cybashbench", "nl2bash")
|
|
804
|
+
if no_immediate_score:
|
|
805
|
+
# These benchmarks don't produce immediate scores - skip check
|
|
806
|
+
pass
|
|
807
|
+
else:
|
|
808
|
+
results.append(
|
|
809
|
+
TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
|
|
705
810
|
)
|
|
706
|
-
)
|
|
707
|
-
results.append(
|
|
708
|
-
TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
|
|
709
|
-
)
|
|
710
811
|
|
|
711
812
|
# Check events (new flow uses setup_started/setup_completed instead of session_started)
|
|
712
813
|
events = ssh_query(f"""
|
|
@@ -22,6 +22,41 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
|
|
|
22
22
|
TEST_EMAIL = "e2e-test@lyptus.dev"
|
|
23
23
|
CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
|
|
24
24
|
|
|
25
|
+
# Expected task assignments per benchmark (4 per benchmark)
|
|
26
|
+
# First 2 for pytest API tests, last 2 for interactive/automated tests
|
|
27
|
+
EXPECTED_TASKS = {
|
|
28
|
+
"cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
|
|
29
|
+
"cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
|
|
30
|
+
"intercode-ctf": [
|
|
31
|
+
"intercode-ctf_general_skills/task_4",
|
|
32
|
+
"intercode-ctf_reverse/task_1",
|
|
33
|
+
"intercode-ctf_reverse/task_3",
|
|
34
|
+
"intercode-ctf_reverse/task_85",
|
|
35
|
+
],
|
|
36
|
+
"nyuctf": [
|
|
37
|
+
"2013f-cry-stfu",
|
|
38
|
+
"2016q-rev-rock",
|
|
39
|
+
"2023q-rev-baby_s_first",
|
|
40
|
+
"2019q-cry-super_curve",
|
|
41
|
+
],
|
|
42
|
+
"cybashbench": [
|
|
43
|
+
"cybashbench_pwn/nl2bash_003",
|
|
44
|
+
"cybashbench_reverse/nl2bash_004",
|
|
45
|
+
"cybashbench_web/nl2bash_001",
|
|
46
|
+
"cybashbench_forensics/nl2bash_005",
|
|
47
|
+
],
|
|
48
|
+
"cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
|
|
49
|
+
"nl2bash": [
|
|
50
|
+
"nl2bash_complex/task_8581",
|
|
51
|
+
"nl2bash_complex/task_713",
|
|
52
|
+
"nl2bash_complex/task_712",
|
|
53
|
+
"nl2bash_complex/task_8796",
|
|
54
|
+
],
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
# Total expected assignments (calculated from EXPECTED_TASKS)
|
|
58
|
+
EXPECTED_ASSIGNMENT_COUNT = sum(len(tasks) for tasks in EXPECTED_TASKS.values())
|
|
59
|
+
|
|
25
60
|
|
|
26
61
|
def ssh_query(query: str) -> str:
|
|
27
62
|
"""Run a sqlite3 query on the VPS (READ-ONLY)."""
|
|
@@ -37,6 +37,9 @@ TEST_NAME = "E2E Test User"
|
|
|
37
37
|
# CLI config path (matches platformdirs on macOS)
|
|
38
38
|
CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
|
|
39
39
|
|
|
40
|
+
# Use the pipx-installed CLI (what experts use), not the local venv version
|
|
41
|
+
HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
|
|
42
|
+
|
|
40
43
|
# Task assignments: 4 per benchmark
|
|
41
44
|
# First 2 for pytest API tests, last 2 for interactive tests
|
|
42
45
|
BENCHMARK_TASKS = {
|
|
@@ -61,6 +64,12 @@ BENCHMARK_TASKS = {
|
|
|
61
64
|
"cybashbench_forensics/nl2bash_005",
|
|
62
65
|
],
|
|
63
66
|
"cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
|
|
67
|
+
"nl2bash": [
|
|
68
|
+
"nl2bash_complex/task_8581",
|
|
69
|
+
"nl2bash_complex/task_713",
|
|
70
|
+
"nl2bash_complex/task_712",
|
|
71
|
+
"nl2bash_complex/task_8796",
|
|
72
|
+
],
|
|
64
73
|
}
|
|
65
74
|
|
|
66
75
|
# Instructions for each benchmark type
|
|
@@ -341,10 +350,10 @@ def setup(admin_password: str, yes: bool):
|
|
|
341
350
|
CLI_CONFIG_PATH.write_text(json.dumps(config, indent=2))
|
|
342
351
|
console.print("[green]CLI config written[/green]")
|
|
343
352
|
|
|
344
|
-
# 7. Verify CLI works
|
|
353
|
+
# 7. Verify CLI works (use pipx version, not local venv)
|
|
345
354
|
console.print("\nVerifying CLI authentication...")
|
|
346
355
|
result = subprocess.run(
|
|
347
|
-
[
|
|
356
|
+
[str(HTE_CLI_PATH), "auth", "status"],
|
|
348
357
|
capture_output=True,
|
|
349
358
|
text=True,
|
|
350
359
|
)
|
|
@@ -688,38 +697,158 @@ def cleanup():
|
|
|
688
697
|
help="Admin password for API access",
|
|
689
698
|
)
|
|
690
699
|
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompts")
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
700
|
+
@click.option("--skip-setup", is_flag=True, help="Skip setup if already done")
|
|
701
|
+
@click.option("--cleanup-after", is_flag=True, help="Run cleanup after tests")
|
|
702
|
+
def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
|
|
703
|
+
"""Run complete E2E test suite in 3 phases.
|
|
704
|
+
|
|
705
|
+
Phase 1: Infrastructure tests (pytest, fast, no containers)
|
|
706
|
+
Phase 2: Automated benchmark E2E tests (pexpect, creates completed sessions)
|
|
707
|
+
Phase 3: Session verification tests (pytest, validates completed sessions)
|
|
708
|
+
|
|
709
|
+
This is fully automated - no user interaction required.
|
|
710
|
+
"""
|
|
711
|
+
console.print(Panel("[bold]Full E2E Test Suite - 3 Phases[/bold]", style="cyan"))
|
|
712
|
+
console.print("""
|
|
713
|
+
[dim]Phase 1:[/dim] Infrastructure tests (pytest)
|
|
714
|
+
[dim]Phase 2:[/dim] Automated benchmark E2E tests (pexpect)
|
|
715
|
+
[dim]Phase 3:[/dim] Session verification tests (pytest)
|
|
716
|
+
""")
|
|
717
|
+
|
|
718
|
+
if not yes and not click.confirm("Run full automated E2E suite?"):
|
|
698
719
|
raise click.ClickException("Aborted")
|
|
699
720
|
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
721
|
+
results = {"phase1": None, "phase2": {}, "phase3": None}
|
|
722
|
+
tests_dir = Path(__file__).parent
|
|
723
|
+
|
|
724
|
+
# Setup (unless skipped)
|
|
725
|
+
if not skip_setup:
|
|
726
|
+
console.print("\n" + "=" * 60)
|
|
727
|
+
console.print("[bold cyan]SETUP: Creating test user and assignments[/bold cyan]")
|
|
728
|
+
console.print("=" * 60)
|
|
729
|
+
ctx = click.get_current_context()
|
|
730
|
+
ctx.invoke(setup, admin_password=admin_password, yes=True)
|
|
731
|
+
|
|
732
|
+
# Phase 1: Infrastructure tests
|
|
733
|
+
console.print("\n" + "=" * 60)
|
|
734
|
+
console.print("[bold cyan]PHASE 1: Infrastructure Tests[/bold cyan]")
|
|
735
|
+
console.print("=" * 60)
|
|
736
|
+
console.print("[dim]Running pytest on infrastructure, imports, benchmark flows...[/dim]\n")
|
|
737
|
+
|
|
738
|
+
phase1_result = subprocess.run(
|
|
739
|
+
[
|
|
740
|
+
"uv", "run", "pytest",
|
|
741
|
+
str(tests_dir / "test_infrastructure.py"),
|
|
742
|
+
str(tests_dir / "test_runtime_imports.py"),
|
|
743
|
+
str(tests_dir / "test_benchmark_flows.py"),
|
|
744
|
+
"-v", "--tb=short",
|
|
745
|
+
],
|
|
746
|
+
cwd=tests_dir.parent.parent,
|
|
747
|
+
)
|
|
748
|
+
results["phase1"] = phase1_result.returncode == 0
|
|
703
749
|
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
750
|
+
if not results["phase1"]:
|
|
751
|
+
console.print("\n[red bold]Phase 1 FAILED - stopping[/red bold]")
|
|
752
|
+
_print_full_summary(results)
|
|
753
|
+
raise SystemExit(1)
|
|
754
|
+
|
|
755
|
+
console.print("\n[green]Phase 1 PASSED[/green]")
|
|
708
756
|
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
757
|
+
# Phase 2: Automated benchmark E2E tests
|
|
758
|
+
console.print("\n" + "=" * 60)
|
|
759
|
+
console.print("[bold cyan]PHASE 2: Automated Benchmark E2E Tests[/bold cyan]")
|
|
760
|
+
console.print("=" * 60)
|
|
761
|
+
console.print("[dim]Running automated tests for each benchmark via pexpect...[/dim]\n")
|
|
762
|
+
|
|
763
|
+
from automated_runner import run_benchmark_test
|
|
764
|
+
|
|
765
|
+
for benchmark in BENCHMARK_TASKS.keys():
|
|
766
|
+
console.print(f"\n[bold]--- {benchmark} ---[/bold]")
|
|
767
|
+
try:
|
|
768
|
+
# Run task index 2 (third task, reserved for automated E2E)
|
|
769
|
+
success = run_benchmark_test(benchmark, task_index=2)
|
|
770
|
+
results["phase2"][benchmark] = success
|
|
771
|
+
if success:
|
|
772
|
+
console.print(f"[green]{benchmark}: PASSED[/green]")
|
|
773
|
+
else:
|
|
774
|
+
console.print(f"[red]{benchmark}: FAILED[/red]")
|
|
775
|
+
except Exception as e:
|
|
776
|
+
console.print(f"[red]{benchmark}: ERROR - {e}[/red]")
|
|
777
|
+
results["phase2"][benchmark] = False
|
|
778
|
+
|
|
779
|
+
phase2_passed = all(results["phase2"].values())
|
|
780
|
+
if not phase2_passed:
|
|
781
|
+
console.print("\n[yellow]Phase 2 had failures - continuing to Phase 3[/yellow]")
|
|
782
|
+
|
|
783
|
+
# Phase 3: Session verification tests
|
|
784
|
+
console.print("\n" + "=" * 60)
|
|
785
|
+
console.print("[bold cyan]PHASE 3: Session Verification Tests[/bold cyan]")
|
|
786
|
+
console.print("=" * 60)
|
|
787
|
+
console.print("[dim]Running pytest on session lifecycle and eval logs...[/dim]\n")
|
|
788
|
+
|
|
789
|
+
phase3_result = subprocess.run(
|
|
790
|
+
[
|
|
791
|
+
"uv", "run", "pytest",
|
|
792
|
+
str(tests_dir / "test_session_lifecycle.py"),
|
|
793
|
+
str(tests_dir / "test_eval_logs.py"),
|
|
794
|
+
"-v", "--tb=short",
|
|
795
|
+
],
|
|
796
|
+
cwd=tests_dir.parent.parent,
|
|
797
|
+
)
|
|
798
|
+
results["phase3"] = phase3_result.returncode == 0
|
|
712
799
|
|
|
713
|
-
#
|
|
714
|
-
|
|
715
|
-
ctx.invoke(verify, admin_password=admin_password)
|
|
800
|
+
# Summary
|
|
801
|
+
_print_full_summary(results)
|
|
716
802
|
|
|
717
803
|
# Cleanup
|
|
718
|
-
|
|
719
|
-
|
|
804
|
+
if cleanup_after:
|
|
805
|
+
console.print("\n" + "=" * 60)
|
|
806
|
+
console.print("[bold cyan]CLEANUP[/bold cyan]")
|
|
807
|
+
ctx = click.get_current_context()
|
|
720
808
|
ctx.invoke(cleanup)
|
|
721
809
|
|
|
722
|
-
|
|
810
|
+
# Exit with appropriate code
|
|
811
|
+
all_passed = results["phase1"] and phase2_passed and results["phase3"]
|
|
812
|
+
if all_passed:
|
|
813
|
+
console.print("\n[bold green]All phases PASSED![/bold green]")
|
|
814
|
+
else:
|
|
815
|
+
console.print("\n[bold red]Some phases FAILED[/bold red]")
|
|
816
|
+
raise SystemExit(1)
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def _print_full_summary(results: dict):
    """Render the per-phase outcome table for an E2E run.

    ``results`` is read by key: ``"phase1"`` and ``"phase3"`` hold either
    ``None`` (the row is omitted) or a pass flag, and ``"phase2"`` is a
    mapping of benchmark name to pass flag (the row is omitted when the
    mapping is empty).
    """

    def _verdict(ok) -> str:
        # Shared PASSED/FAILED markup used by the two pytest-backed phases.
        if ok:
            return "[green]PASSED[/green]"
        return "[red]FAILED[/red]"

    banner = "=" * 60
    console.print("\n" + banner)
    console.print("[bold]SUMMARY[/bold]")
    console.print(banner)

    summary = Table()
    summary.add_column("Phase", style="cyan")
    summary.add_column("Status")
    summary.add_column("Details")

    # Phase 1 row: shown only once the phase has produced a result.
    infra = results["phase1"]
    if infra is not None:
        summary.add_row("Phase 1: Infrastructure", _verdict(infra), "pytest infra/imports/flows")

    # Phase 2 row: a passed/total tally plus one coloured name per benchmark.
    per_benchmark = results["phase2"]
    if per_benchmark:
        total = len(per_benchmark)
        passed = len([ok for ok in per_benchmark.values() if ok])
        if passed == total:
            tally = "[green]PASSED[/green]"
        else:
            tally = f"[yellow]{passed}/{total}[/yellow]"

        coloured = []
        for name, ok in per_benchmark.items():
            colour = "green" if ok else "red"
            coloured.append(f"[{colour}]{name}[/{colour}]")
        summary.add_row("Phase 2: Benchmarks", tally, ", ".join(coloured))

    # Phase 3 row: shown only once the phase has produced a result.
    verification = results["phase3"]
    if verification is not None:
        summary.add_row("Phase 3: Verification", _verdict(verification), "pytest lifecycle/logs")

    console.print(summary)
|
|
723
852
|
|
|
724
853
|
|
|
725
854
|
if __name__ == "__main__":
|
|
@@ -13,7 +13,14 @@ Run with: uv run pytest tests/e2e/test_benchmark_flows.py -v
|
|
|
13
13
|
import pytest
|
|
14
14
|
import requests
|
|
15
15
|
|
|
16
|
-
from tests.e2e.conftest import
|
|
16
|
+
from tests.e2e.conftest import (
|
|
17
|
+
BASE_URL,
|
|
18
|
+
EXPECTED_ASSIGNMENT_COUNT,
|
|
19
|
+
EXPECTED_TASKS,
|
|
20
|
+
get_test_user_id,
|
|
21
|
+
ssh_command,
|
|
22
|
+
ssh_query,
|
|
23
|
+
)
|
|
17
24
|
|
|
18
25
|
# Benchmark test configurations
|
|
19
26
|
# First 2 tasks for pytest API tests, last 2 for interactive tests
|
|
@@ -367,12 +374,14 @@ class TestCrossBenchmark:
|
|
|
367
374
|
assert int(count) > 0, f"No assignments for {benchmark}"
|
|
368
375
|
|
|
369
376
|
def test_total_assignments_correct(self):
|
|
370
|
-
"""Total assignments should
|
|
377
|
+
"""Total assignments should match expected count (4 per benchmark)."""
|
|
371
378
|
count = ssh_query(f"""
|
|
372
379
|
SELECT COUNT(*) FROM assignments
|
|
373
380
|
WHERE user_id = '{get_test_user_id()}'
|
|
374
381
|
""")
|
|
375
|
-
assert int(count) ==
|
|
382
|
+
assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
|
|
383
|
+
f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
|
|
384
|
+
)
|
|
376
385
|
|
|
377
386
|
|
|
378
387
|
# =============================================================================
|
|
@@ -28,6 +28,15 @@ LOCAL_EVAL_LOGS_DIR = Path.home() / "Library" / "Application Support" / "hte-cli
|
|
|
28
28
|
VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"
|
|
29
29
|
|
|
30
30
|
|
|
31
|
+
def db_path_to_host_path(db_path: str) -> str:
    """Translate a container path stored in the DB to the host path on the VPS.

    Backend runs in Docker with /opt/hte-web/data mounted as /data,
    so paths are stored as /data/... but host has /opt/hte-web/data/...

    Only a leading ``/data/`` is rewritten. Paths that do not start with the
    container mount point (including paths that are already host paths) are
    returned unchanged, so the translation is safe to apply more than once.

    Args:
        db_path: Path string as recorded in the database.

    Returns:
        The corresponding host path, or ``db_path`` itself when it does not
        start with the container mount prefix.
    """
    container_prefix = "/data/"
    host_prefix = "/opt/hte-web/data/"
    # Anchor to the prefix: a blanket str.replace would also rewrite nested
    # "/data/" segments and re-translate paths already under host_prefix.
    if db_path.startswith(container_prefix):
        return host_prefix + db_path[len(container_prefix):]
    return db_path
|
|
38
|
+
|
|
39
|
+
|
|
31
40
|
def ssh_query(query: str) -> str:
|
|
32
41
|
"""Run a sqlite3 query on the VPS."""
|
|
33
42
|
result = subprocess.run(
|
|
@@ -85,8 +94,8 @@ class TestLocalEvalLogs:
|
|
|
85
94
|
pytest.skip("Local eval logs directory not found")
|
|
86
95
|
|
|
87
96
|
logs = list(LOCAL_EVAL_LOGS_DIR.glob("*.eval"))
|
|
88
|
-
#
|
|
89
|
-
assert
|
|
97
|
+
# Verify we found eval logs (if E2E tests have run, there should be some)
|
|
98
|
+
assert len(logs) > 0, f"No eval logs found in {LOCAL_EVAL_LOGS_DIR}"
|
|
90
99
|
|
|
91
100
|
|
|
92
101
|
# =============================================================================
|
|
@@ -103,11 +112,12 @@ class TestVPSEvalLogs:
|
|
|
103
112
|
assert result == "exists", "VPS eval logs directory not found"
|
|
104
113
|
|
|
105
114
|
def test_vps_eval_log_count(self):
|
|
106
|
-
"""Should
|
|
115
|
+
"""Should have eval logs on VPS if sessions have completed."""
|
|
107
116
|
result = ssh_command(f"find {VPS_EVAL_LOGS_DIR} -name '*.eval.gz' 2>/dev/null | wc -l")
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
117
|
+
assert result.strip().isdigit(), f"Invalid count result: {result}"
|
|
118
|
+
count = int(result.strip())
|
|
119
|
+
# If E2E tests have run, there should be eval logs
|
|
120
|
+
assert count > 0, f"No eval logs found on VPS in {VPS_EVAL_LOGS_DIR}"
|
|
111
121
|
|
|
112
122
|
def test_completed_sessions_have_eval_log_path(self):
|
|
113
123
|
"""Completed sessions should have eval_log_path recorded."""
|
|
@@ -128,9 +138,14 @@ class TestVPSEvalLogs:
|
|
|
128
138
|
""")
|
|
129
139
|
|
|
130
140
|
# All completed sessions should have eval log paths
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
141
|
+
# Handle empty string from SQL query
|
|
142
|
+
with_path_count = int(with_path) if with_path else 0
|
|
143
|
+
total_count = int(count) if count else 0
|
|
144
|
+
|
|
145
|
+
if total_count == 0:
|
|
146
|
+
pytest.skip("No completed sessions to check")
|
|
147
|
+
|
|
148
|
+
assert with_path_count == total_count, f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"
|
|
134
149
|
|
|
135
150
|
def test_eval_log_files_exist_on_vps(self):
|
|
136
151
|
"""Eval log files referenced in DB should exist on VPS."""
|
|
@@ -147,8 +162,9 @@ class TestVPSEvalLogs:
|
|
|
147
162
|
|
|
148
163
|
for path in paths.split("\n"):
|
|
149
164
|
if path:
|
|
150
|
-
|
|
151
|
-
|
|
165
|
+
host_path = db_path_to_host_path(path)
|
|
166
|
+
exists = ssh_command(f"test -f {host_path} && echo exists")
|
|
167
|
+
assert exists == "exists", f"Eval log not found: {host_path} (DB path: {path})"
|
|
152
168
|
|
|
153
169
|
|
|
154
170
|
# =============================================================================
|
|
@@ -175,32 +191,34 @@ class TestEvalLogFormat:
|
|
|
175
191
|
|
|
176
192
|
def test_eval_log_can_be_decompressed(self):
|
|
177
193
|
"""Eval logs should be valid gzip files."""
|
|
178
|
-
|
|
194
|
+
db_path = ssh_query("""
|
|
179
195
|
SELECT eval_log_path FROM sessions
|
|
180
196
|
WHERE status = 'submitted'
|
|
181
197
|
AND eval_log_path IS NOT NULL
|
|
182
198
|
LIMIT 1
|
|
183
199
|
""")
|
|
184
200
|
|
|
185
|
-
if not
|
|
201
|
+
if not db_path:
|
|
186
202
|
pytest.skip("No eval logs to test")
|
|
187
203
|
|
|
204
|
+
path = db_path_to_host_path(db_path)
|
|
188
205
|
# Try to decompress
|
|
189
206
|
result = ssh_command(f"gunzip -t {path} 2>&1 && echo ok")
|
|
190
207
|
assert "ok" in result, f"Eval log not valid gzip: {result}"
|
|
191
208
|
|
|
192
209
|
def test_eval_log_contains_expected_structure(self):
|
|
193
210
|
"""Eval logs should contain expected Inspect AI structure."""
|
|
194
|
-
|
|
211
|
+
db_path = ssh_query("""
|
|
195
212
|
SELECT eval_log_path FROM sessions
|
|
196
213
|
WHERE status = 'submitted'
|
|
197
214
|
AND eval_log_path IS NOT NULL
|
|
198
215
|
LIMIT 1
|
|
199
216
|
""")
|
|
200
217
|
|
|
201
|
-
if not
|
|
218
|
+
if not db_path:
|
|
202
219
|
pytest.skip("No eval logs to test")
|
|
203
220
|
|
|
221
|
+
path = db_path_to_host_path(db_path)
|
|
204
222
|
# List contents of the gzipped eval (it's actually a zip inside gzip)
|
|
205
223
|
# First copy to temp, decompress, check structure
|
|
206
224
|
result = ssh_command(f"""
|
|
@@ -225,40 +243,43 @@ class TestEvalLogUpload:
|
|
|
225
243
|
"""Test eval log upload functionality."""
|
|
226
244
|
|
|
227
245
|
def test_upload_event_recorded(self):
|
|
228
|
-
"""Upload events should be recorded in session_events."""
|
|
246
|
+
"""Upload events should be recorded in session_events for sessions with eval logs."""
|
|
247
|
+
# Only check sessions that have eval_log_path (proves upload succeeded)
|
|
229
248
|
session_id = ssh_query(f"""
|
|
230
249
|
SELECT id FROM sessions
|
|
231
250
|
WHERE user_id = '{get_test_user_id()}'
|
|
232
251
|
AND status = 'submitted'
|
|
252
|
+
AND eval_log_path IS NOT NULL
|
|
233
253
|
LIMIT 1
|
|
234
254
|
""")
|
|
235
255
|
|
|
236
256
|
if not session_id:
|
|
237
|
-
pytest.skip("No completed sessions")
|
|
257
|
+
pytest.skip("No completed sessions with eval logs")
|
|
238
258
|
|
|
239
259
|
events = ssh_query(f"""
|
|
240
260
|
SELECT event_type FROM session_events
|
|
241
261
|
WHERE session_id = '{session_id}'
|
|
242
262
|
""")
|
|
243
263
|
|
|
244
|
-
# Should have upload-related events
|
|
264
|
+
# Should have upload-related events for sessions with eval logs
|
|
245
265
|
event_list = events.split("\n") if events else []
|
|
246
266
|
has_upload = any("upload" in e.lower() for e in event_list)
|
|
247
|
-
|
|
248
|
-
assert
|
|
267
|
+
|
|
268
|
+
assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"
|
|
249
269
|
|
|
250
270
|
def test_eval_log_size_reasonable(self):
|
|
251
271
|
"""Eval logs should be reasonably sized (not empty, not huge)."""
|
|
252
|
-
|
|
272
|
+
db_path = ssh_query("""
|
|
253
273
|
SELECT eval_log_path FROM sessions
|
|
254
274
|
WHERE status = 'submitted'
|
|
255
275
|
AND eval_log_path IS NOT NULL
|
|
256
276
|
LIMIT 1
|
|
257
277
|
""")
|
|
258
278
|
|
|
259
|
-
if not
|
|
279
|
+
if not db_path:
|
|
260
280
|
pytest.skip("No eval logs to test")
|
|
261
281
|
|
|
282
|
+
path = db_path_to_host_path(db_path)
|
|
262
283
|
size = ssh_command(f"stat -c%s {path} 2>/dev/null || stat -f%z {path}")
|
|
263
284
|
|
|
264
285
|
if size.isdigit():
|
|
@@ -25,31 +25,8 @@ from tests.e2e.conftest import (
|
|
|
25
25
|
ssh_query,
|
|
26
26
|
)
|
|
27
27
|
|
|
28
|
-
#
|
|
29
|
-
|
|
30
|
-
EXPECTED_TASKS = {
|
|
31
|
-
"cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
|
|
32
|
-
"cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
|
|
33
|
-
"intercode-ctf": [
|
|
34
|
-
"intercode-ctf_general_skills/task_4",
|
|
35
|
-
"intercode-ctf_reverse/task_1",
|
|
36
|
-
"intercode-ctf_reverse/task_3",
|
|
37
|
-
"intercode-ctf_reverse/task_85",
|
|
38
|
-
],
|
|
39
|
-
"nyuctf": [
|
|
40
|
-
"2013f-cry-stfu",
|
|
41
|
-
"2016q-rev-rock",
|
|
42
|
-
"2023q-rev-baby_s_first",
|
|
43
|
-
"2019q-cry-super_curve",
|
|
44
|
-
],
|
|
45
|
-
"cybashbench": [
|
|
46
|
-
"cybashbench_pwn/nl2bash_003",
|
|
47
|
-
"cybashbench_reverse/nl2bash_004",
|
|
48
|
-
"cybashbench_web/nl2bash_001",
|
|
49
|
-
"cybashbench_forensics/nl2bash_005",
|
|
50
|
-
],
|
|
51
|
-
"cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
|
|
52
|
-
}
|
|
28
|
+
# Import shared constants from conftest
|
|
29
|
+
from tests.e2e.conftest import EXPECTED_TASKS, EXPECTED_ASSIGNMENT_COUNT
|
|
53
30
|
|
|
54
31
|
|
|
55
32
|
# =============================================================================
|
|
@@ -133,11 +110,13 @@ class TestAssignments:
|
|
|
133
110
|
"""Test that task assignments are correctly set up."""
|
|
134
111
|
|
|
135
112
|
def test_correct_number_of_assignments(self):
|
|
136
|
-
"""Test user should have
|
|
113
|
+
"""Test user should have expected number of assignments."""
|
|
137
114
|
count = ssh_query(
|
|
138
115
|
f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
|
|
139
116
|
)
|
|
140
|
-
assert int(count) ==
|
|
117
|
+
assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
|
|
118
|
+
f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
|
|
119
|
+
)
|
|
141
120
|
|
|
142
121
|
@pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
|
|
143
122
|
def test_benchmark_tasks_assigned(self, benchmark, tasks):
|
|
@@ -226,8 +205,9 @@ class TestAPIEndpoints:
|
|
|
226
205
|
)
|
|
227
206
|
assert response.status_code == 200
|
|
228
207
|
assignments = response.json()
|
|
229
|
-
#
|
|
230
|
-
assert isinstance(assignments, list)
|
|
208
|
+
# Test user should have assignments from E2E setup
|
|
209
|
+
assert isinstance(assignments, list), "Expected list of assignments"
|
|
210
|
+
assert len(assignments) > 0, "Test user should have at least one assignment"
|
|
231
211
|
|
|
232
212
|
def test_assignment_has_task_info(self, api_headers):
|
|
233
213
|
"""Assignments should include task information."""
|
|
@@ -149,8 +149,10 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')
|
|
|
149
149
|
pytest.fail(f"Import failed in container: {result.stderr}")
|
|
150
150
|
|
|
151
151
|
assert "Loaded" in result.stdout
|
|
152
|
-
# Should have
|
|
153
|
-
assert "
|
|
152
|
+
# Should have exactly 7 benchmarks
|
|
153
|
+
assert "7 benchmarks" in result.stdout, (
|
|
154
|
+
f"Expected 7 benchmarks, got: {result.stdout}"
|
|
155
|
+
)
|
|
154
156
|
|
|
155
157
|
def test_backend_can_import_adapters(self):
|
|
156
158
|
"""Backend should be able to instantiate adapters."""
|
|
@@ -176,9 +178,11 @@ for name, cls in HUMAN_REGISTRY.items():
|
|
|
176
178
|
if "FAIL" in result.stdout:
|
|
177
179
|
pytest.fail(f"Adapter instantiation failed: {result.stdout}")
|
|
178
180
|
|
|
179
|
-
#
|
|
181
|
+
# All benchmarks should show OK - STRICT check
|
|
180
182
|
for benchmark in BENCHMARKS:
|
|
181
|
-
assert f"{benchmark}: OK" in result.stdout
|
|
183
|
+
assert f"{benchmark}: OK" in result.stdout, (
|
|
184
|
+
f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
|
|
185
|
+
)
|
|
182
186
|
|
|
183
187
|
|
|
184
188
|
class TestLocalImports:
|
|
@@ -164,17 +164,26 @@ class TestSessionCompletion:
|
|
|
164
164
|
|
|
165
165
|
def test_completed_session_has_score(self):
|
|
166
166
|
"""Completed sessions should have a score."""
|
|
167
|
+
# Count total submitted sessions
|
|
168
|
+
total_submitted = ssh_query(f"""
|
|
169
|
+
SELECT COUNT(*) FROM sessions
|
|
170
|
+
WHERE user_id = '{get_test_user_id()}'
|
|
171
|
+
AND status = 'submitted'
|
|
172
|
+
""")
|
|
173
|
+
total = int(total_submitted) if total_submitted else 0
|
|
174
|
+
if total == 0:
|
|
175
|
+
pytest.skip("No submitted sessions to verify")
|
|
176
|
+
|
|
177
|
+
# Count sessions without score
|
|
167
178
|
sessions_without_score = ssh_query(f"""
|
|
168
179
|
SELECT COUNT(*) FROM sessions
|
|
169
180
|
WHERE user_id = '{get_test_user_id()}'
|
|
170
181
|
AND status = 'submitted'
|
|
171
182
|
AND score IS NULL
|
|
172
183
|
""")
|
|
173
|
-
|
|
174
|
-
#
|
|
175
|
-
count
|
|
176
|
-
# We just want to verify we can query this
|
|
177
|
-
assert count >= 0
|
|
184
|
+
count = int(sessions_without_score) if sessions_without_score else 0
|
|
185
|
+
# Most submitted sessions should have scores (some benchmarks may not score)
|
|
186
|
+
assert count < total, f"All {total} sessions missing scores"
|
|
178
187
|
|
|
179
188
|
def test_completed_session_has_answer(self):
|
|
180
189
|
"""Completed sessions should have an answer."""
|
|
@@ -208,14 +217,16 @@ class TestSessionState:
|
|
|
208
217
|
"""Test session state verification (read-only)."""
|
|
209
218
|
|
|
210
219
|
def test_abandoned_sessions_count(self):
    """Verify the abandoned-session count query returns a valid number.

    A count of 0 is acceptable: no session needs to have been abandoned
    for this check to pass.
    """
    abandoned_count = ssh_query(f"""
        SELECT COUNT(*) FROM sessions
        WHERE user_id = '{get_test_user_id()}'
        AND status = 'abandoned'
    """)
    # Validate the raw output before any conversion, so empty or non-numeric
    # output fails with this message rather than a bare ValueError from int().
    assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
|
|
219
230
|
|
|
220
231
|
def test_no_stuck_sessions_older_than_24h(self):
|
|
221
232
|
"""No in_progress sessions should be older than 24 hours."""
|
|
@@ -387,8 +398,9 @@ class TestSessionCancellation:
|
|
|
387
398
|
WHERE user_id = '{get_test_user_id()}'
|
|
388
399
|
AND status = 'cancelled'
|
|
389
400
|
""")
|
|
390
|
-
#
|
|
391
|
-
assert
|
|
401
|
+
# Verify query returned valid result
|
|
402
|
+
assert cancelled.strip().isdigit(), f"Query returned invalid value: {cancelled}"
|
|
403
|
+
# Note: count can legitimately be 0 if no sessions were cancelled
|
|
392
404
|
|
|
393
405
|
def test_no_orphaned_in_progress_after_cancel(self):
|
|
394
406
|
"""Assignments should not be in_progress if session is cancelled."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|