hte-cli 0.2.19__tar.gz → 0.2.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hte_cli-0.2.19 → hte_cli-0.2.21}/PKG-INFO +1 -1
- {hte_cli-0.2.19 → hte_cli-0.2.21}/pyproject.toml +1 -1
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/automated_runner.py +139 -67
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/conftest.py +35 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/e2e_test.py +149 -23
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_benchmark_flows.py +12 -3
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_eval_logs.py +10 -9
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_infrastructure.py +9 -29
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_runtime_imports.py +8 -4
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/test_session_lifecycle.py +22 -10
- {hte_cli-0.2.19 → hte_cli-0.2.21}/uv.lock +1 -1
- {hte_cli-0.2.19 → hte_cli-0.2.21}/.gitignore +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/README.md +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/__init__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/__main__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/api_client.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/cli.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/config.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/errors.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/events.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/image_utils.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/runner.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/scorers.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/src/hte_cli/version_check.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/__init__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/__init__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/e2e/verify_docker_deps.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/__init__.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/conftest.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_image_utils.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_runner.py +0 -0
- {hte_cli-0.2.19 → hte_cli-0.2.21}/tests/unit/test_scorers.py +0 -0
@@ -55,8 +55,8 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
 # Environment verification commands per benchmark
 # Based on docs/benchmark-environments.md
 # Format: (command, expected_pattern, description)
-#
-# These tests
+# STRICT: All tests must pass - no optional/informational checks
+# These tests verify the exact environment specified in docs
 ENVIRONMENT_CHECKS = {
     "intercode-ctf": [
         # Working directory: /ctf/{task_num} (e.g., /ctf/3, /ctf/85)

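Note: each entry above follows the (command, expected_pattern, description) format. A minimal sketch of how such a tuple can be evaluated against a running task container (the run_check helper, its container_name argument, and the pass/fail convention for expected=None are illustrative assumptions, not code from this package):

import subprocess

def run_check(container_name: str, check: tuple) -> bool:
    # Unpack: (command, expected_pattern, description)
    cmd, expected, desc = check
    result = subprocess.run(
        ["docker", "exec", container_name, "sh", "-c", cmd],
        capture_output=True, text=True, timeout=30,
    )
    output = result.stdout.strip()
    # Assumed convention: when expected_pattern is None, only the exit code matters.
    passed = (result.returncode == 0) if expected is None else (expected in output)
    print(f"{'PASS' if passed else 'FAIL'}: {desc}")
    return passed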
@@ -69,8 +69,8 @@ ENVIRONMENT_CHECKS = {
         ("which netcat", "/usr/bin/netcat", "netcat installed"),
         ("which tcpdump", "/usr/bin/tcpdump", "tcpdump installed"),
         ("which curl", "/usr/bin/curl", "curl installed"),
-        # Forensics tools
-        ("which mmls",
+        # Forensics tools - STRICT: must exist
+        ("which mmls", "/usr/bin/mmls", "mmls (sleuthkit) at /usr/bin/mmls"),
         # Python packages - must import successfully
         ("python3 -c 'from pwn import *; print(\"pwntools\")'", "pwntools", "pwntools importable"),
         ("python3 -c 'from scapy.all import *; print(\"scapy\")'", "scapy", "scapy importable"),

@@ -90,16 +90,17 @@ ENVIRONMENT_CHECKS = {
         ("pwd", "/home/ctfplayer", "Working directory is /home/ctfplayer"),
         # User: ctfplayer (NOT root)
         ("whoami", "ctfplayer", "Running as ctfplayer user"),
-
+        # User ID must be 1000 (standard non-root)
+        ("id -u", "1000", "User ID is 1000"),
         # Passwordless sudo must work
         ("sudo -n whoami", "root", "Passwordless sudo to root"),
         # OS: Ubuntu 22.04 specifically
         ("cat /etc/os-release | grep VERSION_ID", "22.04", "Ubuntu 22.04 base"),
-        # Core tools at expected paths
+        # Core tools at expected paths - STRICT
         ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
-        ("which radare2", "/usr/bin/radare2", "radare2
-        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap
-        ("which jadx",
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+        ("which jadx", "/usr/local/bin/jadx", "jadx at /usr/local/bin/jadx"),
         # Python venv must exist at expected location
         (
             "test -f /home/ctfplayer/.ctfenv/bin/python3 && echo 'venv'",

@@ -112,40 +113,42 @@ ENVIRONMENT_CHECKS = {
         ("python3 -c 'import z3; print(\"z3\")'", "z3", "z3-solver importable"),
         # 32-bit support (for CTF binaries)
         ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
+        # Task files location - STRICT
+        ("test -d /home/ctfplayer/ctf_files && echo 'EXISTS'", "EXISTS", "ctf_files directory exists"),
     ],
     "cybench": [
-        # Working directory: /
-        ("pwd", "/
+        # Working directory: /app (official cybench/cybench:latest image)
+        ("pwd", "/app", "Working directory is /app"),
         # OS: Kali Linux (must contain 'Kali' in name)
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        #
-        #
-
-
-
-        ("which
-
-        ("which
-        ("which
-        #
-        ("which nmap", "/usr/bin/nmap", "nmap installed"),
-        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap installed"),
-        ("which msfconsole || echo 'NOT_FOUND'", None, "[IMAGE] msfconsole"),
-        ("which hydra || echo 'NOT_FOUND'", None, "[IMAGE] hydra"),
-        ("which john || echo 'NOT_FOUND'", None, "[IMAGE] john"),
-        # Docker-in-Docker capability (required for web challenges)
+        # Required tools - STRICT: must exist at expected paths
+        # Using cybench/cybench:latest which has all these tools
+        ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+        ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        ("which binwalk", "/usr/bin/binwalk", "binwalk at /usr/bin/binwalk"),
+        # Security tools - STRICT
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which sqlmap", "/usr/bin/sqlmap", "sqlmap at /usr/bin/sqlmap"),
+        # Docker available (daemon may not be running in all test contexts)
         ("docker --version", "Docker version", "Docker available"),
-        (
-        #
-
-
+        # NOTE: Docker daemon check (docker ps) removed - DinD requires privileged
+        # mode which isn't always available. Web challenges needing DinD will
+        # fail at runtime if daemon isn't accessible.
+        # Python via venv - STRICT
+        ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
+        (
+            "/venv/bin/python3 -c 'from pwn import *; print(\"pwntools\")'",
+            "pwntools",
+            "pwntools in /venv",
+        ),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
         # Task files visible - cybench tasks have nested directories (blog/, nginx/, etc.)
-        # Check that files exist in /root (not just empty directory)
-        ("ls -
+        # Check that files exist in /root (not just empty directory) using test -gt 0
+        ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
         # For tasks with subdirs (chunky, etc.), verify structure is mounted
-        ("find /root -type f 2>/dev/null | head -5", None, "Files under /root (recursive)"),
+        ("test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0", None, "Files under /root (recursive)"),
     ],
     "cybergym": [
         # Working directory: /root

@@ -155,29 +158,27 @@ ENVIRONMENT_CHECKS = {
         # CyberGym-specific: submit.sh MUST exist for PoC submission
         ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present at /submit.sh"),
         ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
-        #
-        ("which gdb
-        ("which ghidra
-        ("which radare2
-        # Core security tools
-        ("which python3",
-        ("which nmap", "/usr/bin/nmap", "nmap
-        ("which curl", "/usr/bin/curl", "curl
+        # Required tools - STRICT (same image as cybench)
+        ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
+        ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
+        ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
+        # Core security tools - STRICT
+        ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
-        # Task files from setup.sh
-        ("ls /root/ 2>/dev/null | wc -l", None, "Files in /root"),
     ],
     "cvebench": [
         # OS: Kali Linux
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # Core security tools for CVE exploitation
-        ("which python3",
-        ("which nmap", "/usr/bin/nmap", "nmap
-        ("which curl", "/usr/bin/curl", "curl
-        ("which wget", "/usr/bin/wget", "wget
-        # Network access to target
-        ("which nc",
+        # Core security tools for CVE exploitation - STRICT
+        ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
+        ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
+        ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
+        ("which wget", "/usr/bin/wget", "wget at /usr/bin/wget"),
+        # Network access to target - STRICT
+        ("which nc", "/usr/bin/nc", "netcat at /usr/bin/nc"),
         # apt-get for additional tools
         ("apt-get --version", "apt", "apt-get available"),
     ],

@@ -191,6 +192,43 @@ ENVIRONMENT_CHECKS = {
     ],
 }
 
+# Task-specific file expectations
+# Maps benchmark -> task_id -> list of expected paths
+# Use "*" as task_id for checks that apply to all tasks in a benchmark
+TASK_FILE_CHECKS = {
+    "cybench": {
+        "chunky": ["/root/blog", "/root/nginx", "/root/rcache"],
+        "avatar": ["/root/chall.py"],
+        "lootstash": ["/root/stash"],
+        "primary_knowledge": [],  # No local files - network-based
+    },
+    "cybergym": {
+        "*": ["/submit.sh"],  # All cybergym tasks need submit.sh
+    },
+    "nyuctf": {
+        "*": ["/home/ctfplayer/ctf_files"],  # Standard location
+    },
+    "intercode-ctf": {
+        "*": ["/ctf"],  # Task files in /ctf directory
+    },
+}
+
+
+def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
+    """Generate file existence checks based on task expectations."""
+    checks = []
+    task_files = TASK_FILE_CHECKS.get(benchmark, {})
+
+    # Try task-specific first, then fallback to wildcard
+    expected = task_files.get(task_id) or task_files.get("*", [])
+
+    for path in expected:
+        checks.append(
+            (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
+        )
+
+    return checks
+
 # Commands to run for submission tests
 SUBMISSION_TESTS = {
     "intercode-ctf": {

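Note: the new get_file_checks_for_task helper simply expands the TASK_FILE_CHECKS table into additional (command, expected_pattern, description) tuples. For example, given the table above:

get_file_checks_for_task("cybench", "chunky")
# -> [("test -e /root/blog && echo 'EXISTS'", "EXISTS", "/root/blog exists"),
#     ("test -e /root/nginx && echo 'EXISTS'", "EXISTS", "/root/nginx exists"),
#     ("test -e /root/rcache && echo 'EXISTS'", "EXISTS", "/root/rcache exists")]

# Benchmarks that only define a "*" wildcard apply the same checks to every task:
get_file_checks_for_task("cybergym", "arvo:8046")
# -> [("test -e /submit.sh && echo 'EXISTS'", "EXISTS", "/submit.sh exists")]

# Unknown benchmarks, and tasks with an empty list (e.g. cybench "primary_knowledge"), yield no extra checks.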
@@ -503,7 +541,12 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
 
     # Get benchmark-specific checks from ENVIRONMENT_CHECKS
     env_checks = ENVIRONMENT_CHECKS.get(benchmark, [])
-
+
+    # Add task-specific file checks
+    file_checks = get_file_checks_for_task(benchmark, task_id)
+    all_checks = env_checks + file_checks
+
+    for check in all_checks:
         # Unpack: (command, expected_pattern, description)
         if len(check) == 3:
             cmd, expected, desc = check

@@ -517,7 +560,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
                 ["docker", "exec", container_name, "sh", "-c", cmd],
                 capture_output=True,
                 text=True,
-                timeout=
+                timeout=30,  # Increased for slow imports (angr takes ~10s)
             )
             output = result.stdout.strip()
             stderr = result.stderr.strip()

@@ -564,13 +607,20 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
             time.sleep(2)
             docker_child.expect(prompt_patterns[:-1], timeout=30)
             output = strip_ansi(docker_child.before or "")
-
-
-
-
-
+
+            expected_score = sub_tests.get("score_expect")
+            if expected_score:
+                passed = expected_score.lower() in output.lower()
+                details = output[:200] if passed else f"Expected '{expected_score}' in output: {output[:100]}..."
+                results.append(TestResult("task score", passed, details))
+            else:
+                results.append(
+                    TestResult(
+                        "task score",
+                        True,  # Just checking it runs
+                        output[:200],
+                    )
                 )
-            )
 
         # Submit answer
         console.print("Submitting test answer...")

@@ -605,14 +655,15 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
                 timeout=60,
             )
             # EOF (idx=4) is expected - container exits after task submit
-
+            # TIMEOUT (idx=3) is a failure
+            if idx != 3:
                 results.append(
                     TestResult("Submission", True, "Answer submitted (container exited)")
                 )
             else:
                 results.append(
                     TestResult(
-                        "Submission", False, docker_child.before or "", "Submission
+                        "Submission", False, docker_child.before or "", "Submission timed out waiting for result"
                     )
                 )
         elif idx < 3:

@@ -624,7 +675,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
         else:
             results.append(
                 TestResult(
-                    "Submission", False, docker_child.before or "", "Submission
+                    "Submission", False, docker_child.before or "", "Submission timed out at prompt"
                 )
             )
 

@@ -671,10 +722,19 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     return results
 
 
-def verify_artifacts(task_id: str,
+def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
     """Verify session and eval log artifacts were created."""
     results = []
 
+    # Get expected answer from SUBMISSION_TESTS
+    sub_tests = SUBMISSION_TESTS.get(benchmark, {})
+    submit_cmd = sub_tests.get("submit_cmd", "")
+    expected_answer = None
+    if 'task submit "' in submit_cmd:
+        expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
+    elif "task submit '" in submit_cmd:
+        expected_answer = submit_cmd.split("task submit '")[1].rstrip("'")
+
     # Check session in database
     session_info = ssh_query(f"""
         SELECT id, status, score, client_active_seconds, answer

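Note: the expected answer is recovered by splitting the benchmark's submit_cmd on the quoted "task submit" invocation. A small worked example with a hypothetical submit_cmd (real values live in SUBMISSION_TESTS and are not shown in this diff):

submit_cmd = 'task submit "flag{example}"'  # hypothetical value, for illustration only
expected_answer = None
if 'task submit "' in submit_cmd:
    expected_answer = submit_cmd.split('task submit "')[1].rstrip('"')
elif "task submit '" in submit_cmd:
    expected_answer = submit_cmd.split("task submit '")[1].rstrip("'")
assert expected_answer == "flag{example}"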
@@ -699,11 +759,23 @@ def verify_artifacts(task_id: str, _benchmark: str) -> list[TestResult]:
                 "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
             )
         )
-
-
-
+
+    # Verify answer
+    if expected_answer and benchmark != "cybergym":  # Cybergym submits file content, hard to verify here
+        results.append(
+            TestResult(
+                "Answer matches submission",
+                answer == expected_answer,
+                f"Expected: '{expected_answer}', Got: '{answer}'"
+            )
         )
-
+    else:
+        results.append(
+            TestResult(
+                "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
+            )
+        )
+
     results.append(
         TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
     )

@@ -22,6 +22,41 @@ VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
 TEST_EMAIL = "e2e-test@lyptus.dev"
 CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
 
+# Expected task assignments per benchmark (4 per benchmark)
+# First 2 for pytest API tests, last 2 for interactive/automated tests
+EXPECTED_TASKS = {
+    "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
+    "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
+    "intercode-ctf": [
+        "intercode-ctf_general_skills/task_4",
+        "intercode-ctf_reverse/task_1",
+        "intercode-ctf_reverse/task_3",
+        "intercode-ctf_reverse/task_85",
+    ],
+    "nyuctf": [
+        "2013f-cry-stfu",
+        "2016q-rev-rock",
+        "2023q-rev-baby_s_first",
+        "2019q-cry-super_curve",
+    ],
+    "cybashbench": [
+        "cybashbench_pwn/nl2bash_003",
+        "cybashbench_reverse/nl2bash_004",
+        "cybashbench_web/nl2bash_001",
+        "cybashbench_forensics/nl2bash_005",
+    ],
+    "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+    "nl2bash": [
+        "nl2bash_complex/task_8581",
+        "nl2bash_complex/task_713",
+        "nl2bash_complex/task_712",
+        "nl2bash_complex/task_8796",
+    ],
+}
+
+# Total expected assignments (calculated from EXPECTED_TASKS)
+EXPECTED_ASSIGNMENT_COUNT = sum(len(tasks) for tasks in EXPECTED_TASKS.values())
+
 
 def ssh_query(query: str) -> str:
     """Run a sqlite3 query on the VPS (READ-ONLY)."""

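Note: with the table above, EXPECTED_ASSIGNMENT_COUNT works out to 28 (7 benchmarks × 4 tasks each); the assignment-count assertions in test_infrastructure.py and test_benchmark_flows.py compare against this value rather than a hard-coded number.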
@@ -61,6 +61,12 @@ BENCHMARK_TASKS = {
         "cybashbench_forensics/nl2bash_005",
     ],
     "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
+    "nl2bash": [
+        "nl2bash_complex/task_8581",
+        "nl2bash_complex/task_713",
+        "nl2bash_complex/task_712",
+        "nl2bash_complex/task_8796",
+    ],
 }
 
 # Instructions for each benchmark type

@@ -688,38 +694,158 @@ def cleanup():
     help="Admin password for API access",
 )
 @click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompts")
-
-
-
-
-
-
-
+@click.option("--skip-setup", is_flag=True, help="Skip setup if already done")
+@click.option("--cleanup-after", is_flag=True, help="Run cleanup after tests")
+def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
+    """Run complete E2E test suite in 3 phases.
+
+    Phase 1: Infrastructure tests (pytest, fast, no containers)
+    Phase 2: Automated benchmark E2E tests (pexpect, creates completed sessions)
+    Phase 3: Session verification tests (pytest, validates completed sessions)
+
+    This is fully automated - no user interaction required.
+    """
+    console.print(Panel("[bold]Full E2E Test Suite - 3 Phases[/bold]", style="cyan"))
+    console.print("""
+    [dim]Phase 1:[/dim] Infrastructure tests (pytest)
+    [dim]Phase 2:[/dim] Automated benchmark E2E tests (pexpect)
+    [dim]Phase 3:[/dim] Session verification tests (pytest)
+    """)
+
+    if not yes and not click.confirm("Run full automated E2E suite?"):
         raise click.ClickException("Aborted")
 
-
-
-
+    results = {"phase1": None, "phase2": {}, "phase3": None}
+    tests_dir = Path(__file__).parent
+
+    # Setup (unless skipped)
+    if not skip_setup:
+        console.print("\n" + "=" * 60)
+        console.print("[bold cyan]SETUP: Creating test user and assignments[/bold cyan]")
+        console.print("=" * 60)
+        ctx = click.get_current_context()
+        ctx.invoke(setup, admin_password=admin_password, yes=True)
+
+    # Phase 1: Infrastructure tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 1: Infrastructure Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running pytest on infrastructure, imports, benchmark flows...[/dim]\n")
+
+    phase1_result = subprocess.run(
+        [
+            "uv", "run", "pytest",
+            str(tests_dir / "test_infrastructure.py"),
+            str(tests_dir / "test_runtime_imports.py"),
+            str(tests_dir / "test_benchmark_flows.py"),
+            "-v", "--tb=short",
+        ],
+        cwd=tests_dir.parent.parent,
+    )
+    results["phase1"] = phase1_result.returncode == 0
+
+    if not results["phase1"]:
+        console.print("\n[red bold]Phase 1 FAILED - stopping[/red bold]")
+        _print_full_summary(results)
+        raise SystemExit(1)
 
-
-
-
-
+    console.print("\n[green]Phase 1 PASSED[/green]")
+
+    # Phase 2: Automated benchmark E2E tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 2: Automated Benchmark E2E Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running automated tests for each benchmark via pexpect...[/dim]\n")
 
-
-        if click.confirm(f"\nRun task {i+1}/2 for {benchmark}?"):
-            ctx.invoke(run, benchmark=benchmark, task_index=i)
+    from automated_runner import run_benchmark_test
 
-
-
-
+    for benchmark in BENCHMARK_TASKS.keys():
+        console.print(f"\n[bold]--- {benchmark} ---[/bold]")
+        try:
+            # Run task index 2 (third task, reserved for automated E2E)
+            success = run_benchmark_test(benchmark, task_index=2)
+            results["phase2"][benchmark] = success
+            if success:
+                console.print(f"[green]{benchmark}: PASSED[/green]")
+            else:
+                console.print(f"[red]{benchmark}: FAILED[/red]")
+        except Exception as e:
+            console.print(f"[red]{benchmark}: ERROR - {e}[/red]")
+            results["phase2"][benchmark] = False
+
+    phase2_passed = all(results["phase2"].values())
+    if not phase2_passed:
+        console.print("\n[yellow]Phase 2 had failures - continuing to Phase 3[/yellow]")
+
+    # Phase 3: Session verification tests
+    console.print("\n" + "=" * 60)
+    console.print("[bold cyan]PHASE 3: Session Verification Tests[/bold cyan]")
+    console.print("=" * 60)
+    console.print("[dim]Running pytest on session lifecycle and eval logs...[/dim]\n")
+
+    phase3_result = subprocess.run(
+        [
+            "uv", "run", "pytest",
+            str(tests_dir / "test_session_lifecycle.py"),
+            str(tests_dir / "test_eval_logs.py"),
+            "-v", "--tb=short",
+        ],
+        cwd=tests_dir.parent.parent,
+    )
+    results["phase3"] = phase3_result.returncode == 0
+
+    # Summary
+    _print_full_summary(results)
 
     # Cleanup
-
-
+    if cleanup_after:
+        console.print("\n" + "=" * 60)
+        console.print("[bold cyan]CLEANUP[/bold cyan]")
+        ctx = click.get_current_context()
         ctx.invoke(cleanup)
 
-
+    # Exit with appropriate code
+    all_passed = results["phase1"] and phase2_passed and results["phase3"]
+    if all_passed:
+        console.print("\n[bold green]All phases PASSED![/bold green]")
+    else:
+        console.print("\n[bold red]Some phases FAILED[/bold red]")
+        raise SystemExit(1)
+
+
+def _print_full_summary(results: dict):
+    """Print summary table of all phases."""
+    console.print("\n" + "=" * 60)
+    console.print("[bold]SUMMARY[/bold]")
+    console.print("=" * 60)
+
+    table = Table()
+    table.add_column("Phase", style="cyan")
+    table.add_column("Status")
+    table.add_column("Details")
+
+    # Phase 1
+    if results["phase1"] is not None:
+        status = "[green]PASSED[/green]" if results["phase1"] else "[red]FAILED[/red]"
+        table.add_row("Phase 1: Infrastructure", status, "pytest infra/imports/flows")
+
+    # Phase 2
+    if results["phase2"]:
+        passed = sum(1 for v in results["phase2"].values() if v)
+        total = len(results["phase2"])
+        status = "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+        details = ", ".join(
+            f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
+            for b, v in results["phase2"].items()
+        )
+        table.add_row("Phase 2: Benchmarks", status, details)
+
+    # Phase 3
+    if results["phase3"] is not None:
+        status = "[green]PASSED[/green]" if results["phase3"] else "[red]FAILED[/red]"
+        table.add_row("Phase 3: Verification", status, "pytest lifecycle/logs")
+
+    console.print(table)
 
 
 if __name__ == "__main__":

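Note: assuming e2e_test.py is the click entry point that exposes the setup, full, and cleanup commands, a fully automated run might look like the following (invocation path and password placeholder are illustrative, not taken from the package docs):

uv run python tests/e2e/e2e_test.py full --admin-password '<admin-password>' --yes --cleanup-after
# Re-run later without recreating the test user and assignments:
uv run python tests/e2e/e2e_test.py full --admin-password '<admin-password>' --yes --skip-setup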
@@ -13,7 +13,14 @@ Run with: uv run pytest tests/e2e/test_benchmark_flows.py -v
 import pytest
 import requests
 
-from tests.e2e.conftest import
+from tests.e2e.conftest import (
+    BASE_URL,
+    EXPECTED_ASSIGNMENT_COUNT,
+    EXPECTED_TASKS,
+    get_test_user_id,
+    ssh_command,
+    ssh_query,
+)
 
 # Benchmark test configurations
 # First 2 tasks for pytest API tests, last 2 for interactive tests

@@ -367,12 +374,14 @@ class TestCrossBenchmark:
         assert int(count) > 0, f"No assignments for {benchmark}"
 
     def test_total_assignments_correct(self):
-        """Total assignments should
+        """Total assignments should match expected count (4 per benchmark)."""
         count = ssh_query(f"""
             SELECT COUNT(*) FROM assignments
             WHERE user_id = '{get_test_user_id()}'
         """)
-        assert int(count) ==
+        assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+            f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+        )
 
 
 # =============================================================================

@@ -85,8 +85,8 @@ class TestLocalEvalLogs:
             pytest.skip("Local eval logs directory not found")
 
         logs = list(LOCAL_EVAL_LOGS_DIR.glob("*.eval"))
-        #
-        assert
+        # Verify we found eval logs (if E2E tests have run, there should be some)
+        assert len(logs) > 0, f"No eval logs found in {LOCAL_EVAL_LOGS_DIR}"
 
 
 # =============================================================================

@@ -103,11 +103,12 @@ class TestVPSEvalLogs:
         assert result == "exists", "VPS eval logs directory not found"
 
     def test_vps_eval_log_count(self):
-        """Should
+        """Should have eval logs on VPS if sessions have completed."""
         result = ssh_command(f"find {VPS_EVAL_LOGS_DIR} -name '*.eval.gz' 2>/dev/null | wc -l")
-
-
-
+        assert result.strip().isdigit(), f"Invalid count result: {result}"
+        count = int(result.strip())
+        # If E2E tests have run, there should be eval logs
+        assert count > 0, f"No eval logs found on VPS in {VPS_EVAL_LOGS_DIR}"
 
     def test_completed_sessions_have_eval_log_path(self):
         """Completed sessions should have eval_log_path recorded."""

@@ -241,11 +242,11 @@ class TestEvalLogUpload:
             WHERE session_id = '{session_id}'
         """)
 
-        # Should have upload-related events
+        # Should have upload-related events for completed sessions
         event_list = events.split("\n") if events else []
         has_upload = any("upload" in e.lower() for e in event_list)
-        #
-        assert
+        # Completed sessions should have upload events
+        assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"
 
     def test_eval_log_size_reasonable(self):
         """Eval logs should be reasonably sized (not empty, not huge)."""

@@ -25,31 +25,8 @@ from tests.e2e.conftest import (
     ssh_query,
 )
 
-#
-
-EXPECTED_TASKS = {
-    "cybergym": ["arvo:60121", "arvo:8046", "arvo:41073", "arvo:43984"],
-    "cybench": ["chunky", "avatar", "lootstash", "primary_knowledge"],
-    "intercode-ctf": [
-        "intercode-ctf_general_skills/task_4",
-        "intercode-ctf_reverse/task_1",
-        "intercode-ctf_reverse/task_3",
-        "intercode-ctf_reverse/task_85",
-    ],
-    "nyuctf": [
-        "2013f-cry-stfu",
-        "2016q-rev-rock",
-        "2023q-rev-baby_s_first",
-        "2019q-cry-super_curve",
-    ],
-    "cybashbench": [
-        "cybashbench_pwn/nl2bash_003",
-        "cybashbench_reverse/nl2bash_004",
-        "cybashbench_web/nl2bash_001",
-        "cybashbench_forensics/nl2bash_005",
-    ],
-    "cvebench": ["CVE-2024-5084", "CVE-2024-4442", "CVE-2024-22120", "CVE-2024-32511"],
-}
+# Import shared constants from conftest
+from tests.e2e.conftest import EXPECTED_TASKS, EXPECTED_ASSIGNMENT_COUNT
 
 
 # =============================================================================

@@ -133,11 +110,13 @@ class TestAssignments:
     """Test that task assignments are correctly set up."""
 
     def test_correct_number_of_assignments(self):
-        """Test user should have
+        """Test user should have expected number of assignments."""
         count = ssh_query(
             f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
         )
-        assert int(count) ==
+        assert int(count) == EXPECTED_ASSIGNMENT_COUNT, (
+            f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"
+        )
 
     @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
     def test_benchmark_tasks_assigned(self, benchmark, tasks):

@@ -226,8 +205,9 @@ class TestAPIEndpoints:
         )
         assert response.status_code == 200
         assignments = response.json()
-        #
-        assert isinstance(assignments, list)
+        # Test user should have assignments from E2E setup
+        assert isinstance(assignments, list), "Expected list of assignments"
+        assert len(assignments) > 0, "Test user should have at least one assignment"
 
     def test_assignment_has_task_info(self, api_headers):
         """Assignments should include task information."""

@@ -149,8 +149,10 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')
             pytest.fail(f"Import failed in container: {result.stderr}")
 
         assert "Loaded" in result.stdout
-        # Should have
-        assert "
+        # Should have exactly 7 benchmarks
+        assert "7 benchmarks" in result.stdout, (
+            f"Expected 7 benchmarks, got: {result.stdout}"
+        )
 
     def test_backend_can_import_adapters(self):
         """Backend should be able to instantiate adapters."""

@@ -176,9 +178,11 @@ for name, cls in HUMAN_REGISTRY.items():
         if "FAIL" in result.stdout:
             pytest.fail(f"Adapter instantiation failed: {result.stdout}")
 
-        #
+        # All benchmarks should show OK - STRICT check
         for benchmark in BENCHMARKS:
-            assert f"{benchmark}: OK" in result.stdout
+            assert f"{benchmark}: OK" in result.stdout, (
+                f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"
+            )
 
 
 class TestLocalImports:

@@ -164,17 +164,26 @@ class TestSessionCompletion:
 
     def test_completed_session_has_score(self):
         """Completed sessions should have a score."""
+        # Count total submitted sessions
+        total_submitted = ssh_query(f"""
+            SELECT COUNT(*) FROM sessions
+            WHERE user_id = '{get_test_user_id()}'
+            AND status = 'submitted'
+        """)
+        total = int(total_submitted) if total_submitted else 0
+        if total == 0:
+            pytest.skip("No submitted sessions to verify")
+
+        # Count sessions without score
         sessions_without_score = ssh_query(f"""
             SELECT COUNT(*) FROM sessions
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'submitted'
             AND score IS NULL
         """)
-
-        #
-        count
-        # We just want to verify we can query this
-        assert count >= 0
+        count = int(sessions_without_score) if sessions_without_score else 0
+        # Most submitted sessions should have scores (some benchmarks may not score)
+        assert count < total, f"All {total} sessions missing scores"
 
     def test_completed_session_has_answer(self):
         """Completed sessions should have an answer."""

@@ -208,14 +217,16 @@ class TestSessionState:
     """Test session state verification (read-only)."""
 
     def test_abandoned_sessions_count(self):
-        """Verify
+        """Verify abandoned sessions exist and are queryable."""
         abandoned_count = ssh_query(f"""
             SELECT COUNT(*) FROM sessions
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'abandoned'
         """)
-
-
+        count = int(abandoned_count) if abandoned_count else 0
+        # Verify the query returned a valid number (not empty/error)
+        assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
+        # Note: count can legitimately be 0 if no sessions were abandoned
 
     def test_no_stuck_sessions_older_than_24h(self):
         """No in_progress sessions should be older than 24 hours."""

@@ -387,8 +398,9 @@ class TestSessionCancellation:
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'cancelled'
         """)
-        #
-        assert
+        # Verify query returned valid result
+        assert cancelled.strip().isdigit(), f"Query returned invalid value: {cancelled}"
+        # Note: count can legitimately be 0 if no sessions were cancelled
 
     def test_no_orphaned_in_progress_after_cancel(self):
         """Assignments should not be in_progress if session is cancelled."""

|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|