hte-cli 0.2.24__tar.gz → 0.2.26__tar.gz
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {hte_cli-0.2.24 → hte_cli-0.2.26}/.gitignore +1 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/PKG-INFO +1 -1
- {hte_cli-0.2.24 → hte_cli-0.2.26}/pyproject.toml +1 -1
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/cli.py +54 -7
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/image_utils.py +36 -1
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/automated_runner.py +94 -11
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/conftest.py +43 -1
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/e2e_test.py +131 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/test_eval_logs.py +68 -6
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/test_session_lifecycle.py +10 -1
- {hte_cli-0.2.24 → hte_cli-0.2.26}/uv.lock +1 -1
- {hte_cli-0.2.24 → hte_cli-0.2.26}/README.md +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/__init__.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/__main__.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/api_client.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/config.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/errors.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/events.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/runner.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/scorers.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/src/hte_cli/version_check.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/__init__.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/__init__.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/test_benchmark_flows.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/test_infrastructure.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/test_runtime_imports.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/e2e/verify_docker_deps.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/unit/__init__.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/unit/conftest.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/unit/test_image_utils.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/unit/test_runner.py +0 -0
- {hte_cli-0.2.24 → hte_cli-0.2.26}/tests/unit/test_scorers.py +0 -0
src/hte_cli/cli.py
@@ -172,6 +172,18 @@ def session_join(ctx, session_id: str, force_setup: bool):
         console.print("[red]Not logged in. Run: hte-cli auth login[/red]")
         sys.exit(1)
 
+    # Check Docker is running before we start (with retry prompt)
+    while True:
+        docker_ok, docker_error = _check_docker()
+        if docker_ok:
+            console.print("[dim]✓ Docker running[/dim]")
+            break
+        console.print(f"[red]{docker_error}[/red]")
+        console.print()
+        if not click.confirm("Start Docker and retry?", default=True):
+            sys.exit(1)
+        console.print("[dim]Checking Docker again...[/dim]")
+
     api = APIClient(config)
 
     # Step 1: Join session
@@ -201,8 +213,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
     # Check if reconnecting (session already in_progress)
     is_reconnect = session_info.get("status") == "in_progress"
 
-
-
+    # Always run setup on reconnect - previous attempt may have failed
+    # (e.g., image pull failed, Docker wasn't running, etc.)
+    if is_reconnect:
+        force_setup = True
+        console.print("[yellow]Reconnecting to existing session (re-running setup)...[/yellow]")
     console.print()
 
     console.print(
@@ -219,7 +234,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
     import time
     from hte_cli.events import EventStreamer
     from hte_cli.runner import TaskRunner
-    from hte_cli.image_utils import
+    from hte_cli.image_utils import (
+        extract_images_from_compose,
+        extract_image_platforms_from_compose,
+        pull_image_with_progress,
+    )
 
     # Create event streamer
     events = EventStreamer(api, session_id)
@@ -285,9 +304,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
     failed_images = []
 
     if not is_reconnect or force_setup:
-        # Extract images from compose
+        # Extract images and their platforms from compose
+        image_platforms = {}
         if compose_yaml:
             images = extract_images_from_compose(compose_yaml)
+            image_platforms = extract_image_platforms_from_compose(compose_yaml)
 
         # Send setup_started event (includes CLI version for debugging)
         events.setup_started(images=images, cli_version=__version__)
@@ -298,9 +319,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
 
         console.print(f"[bold]Step 2:[/bold] Pulling {len(images)} Docker image(s)...")
         pull_start = time.monotonic()
+        pull_errors = {}
 
         for img in images:
             short_name = img.split("/")[-1][:40]
+            platform = image_platforms.get(img)
 
             # Check if already cached
             if check_image_exists_locally(img):
@@ -310,6 +333,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
 
             # Need to pull - show progress
             last_status = ["connecting..."]
+            last_error = [""]
             with console.status(
                 f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
             ) as status:
@@ -328,14 +352,23 @@ def session_join(ctx, session_id: str, force_setup: bool):
                     status.update(
                         f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]"
                     )
+                    # Capture error messages
+                    if "error" in line.lower() or "denied" in line.lower():
+                        last_error[0] = line
 
-                success = pull_image_with_progress(
+                success = pull_image_with_progress(
+                    img, platform=platform, on_progress=show_progress
+                )
 
                 if success:
                     console.print(f" [green]✓[/green] {short_name} [dim](downloaded)[/dim]")
                     pulled_images.append(img)
                 else:
-
+                    platform_note = f" (platform: {platform})" if platform else ""
+                    console.print(f" [red]✗[/red] {short_name}{platform_note} [dim](failed)[/dim]")
+                    if last_error[0]:
+                        console.print(f" [dim]{last_error[0][:60]}[/dim]")
+                    pull_errors[img] = last_error[0]
                     failed_images.append(img)
 
             pull_duration = time.monotonic() - pull_start
@@ -347,6 +380,20 @@ def session_join(ctx, session_id: str, force_setup: bool):
         )
         console.print()
 
+        # Fail fast if any required image couldn't be pulled
+        if failed_images:
+            console.print(
+                f"[red]Error: Failed to pull {len(failed_images)} required Docker image(s).[/red]"
+            )
+            console.print()
+            console.print("[yellow]Troubleshooting:[/yellow]")
+            console.print(" 1. Check Docker is running: docker info")
+            console.print(" 2. Try manual pull: docker pull python:3.12-slim --platform linux/amd64")
+            console.print(" 3. Check network connectivity")
+            console.print()
+            console.print("Session remains active - you can retry with: hte-cli session join " + session_id)
+            sys.exit(1)
+
         # Send setup_completed - THIS STARTS THE TIMER ON SERVER
         total_setup = time.monotonic() - setup_start_time
         events.setup_completed(total_seconds=total_setup)
@@ -644,7 +691,7 @@ def _check_docker() -> tuple[bool, str | None]:
             timeout=10,
         )
         if result.returncode != 0:
-            return False, "Docker is not running. Start Docker Desktop or
+            return False, "Docker is not running. Start Docker (Docker Desktop, colima, or dockerd)."
     except FileNotFoundError:
         return False, "Docker is not installed. Install from https://docs.docker.com/get-docker/"
     except Exception as e:
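For context, the full `_check_docker()` helper is not shown in this hunk. A minimal sketch consistent with the error messages and the `timeout=10` / returncode check above, assuming the helper shells out to `docker info` (the command and wording are assumptions, not confirmed by the diff):

import subprocess

def check_docker_running() -> tuple[bool, str | None]:
    # Hypothetical standalone equivalent of _check_docker(); names and messages are illustrative.
    try:
        result = subprocess.run(
            ["docker", "info"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode != 0:
            return False, "Docker is not running. Start Docker (Docker Desktop, colima, or dockerd)."
    except FileNotFoundError:
        return False, "Docker is not installed. Install from https://docs.docker.com/get-docker/"
    except Exception as e:
        # Catch-all mirroring the `except Exception as e:` branch visible in the hunk.
        return False, f"Docker check failed: {e}"
    return True, None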
src/hte_cli/image_utils.py
@@ -38,6 +38,33 @@ def extract_images_from_compose(compose_yaml: str) -> list[str]:
         return []
 
 
+def extract_image_platforms_from_compose(compose_yaml: str) -> dict[str, str | None]:
+    """
+    Extract Docker image names and their platforms from a compose.yaml string.
+
+    Args:
+        compose_yaml: Docker Compose YAML content
+
+    Returns:
+        Dict mapping image names to their platform (or None if no platform specified)
+    """
+    try:
+        compose_data = yaml.safe_load(compose_yaml)
+        if not compose_data or "services" not in compose_data:
+            return {}
+
+        image_platforms = {}
+        for service_name, service_config in compose_data.get("services", {}).items():
+            if isinstance(service_config, dict) and "image" in service_config:
+                image = service_config["image"]
+                platform = service_config.get("platform")
+                image_platforms[image] = platform
+        return image_platforms
+    except yaml.YAMLError as e:
+        logger.warning(f"Failed to parse compose.yaml: {e}")
+        return {}
+
+
 def check_image_exists_locally(image: str) -> bool:
     """
     Check if a Docker image exists locally.
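As an illustration (not part of the diff), feeding the new helper a small compose file maps each service's image to its declared platform, with None when no platform key is present; the service names and images below are made up:

compose_yaml = """
services:
  agent:
    image: python:3.12-slim
    platform: linux/amd64
  db:
    image: postgres:16
"""

# Assuming: from hte_cli.image_utils import extract_image_platforms_from_compose
print(extract_image_platforms_from_compose(compose_yaml))
# -> {'python:3.12-slim': 'linux/amd64', 'postgres:16': None}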
@@ -61,16 +88,20 @@ def check_image_exists_locally(image: str) -> bool:
 
 def pull_image_with_progress(
     image: str,
+    platform: str | None = None,
     on_progress: Callable[[str, str], None] | None = None,
     on_complete: Callable[[str, bool], None] | None = None,
+    on_error: Callable[[str, str], None] | None = None,
 ) -> bool:
     """
     Pull a Docker image with progress callbacks using PTY for real progress output.
 
     Args:
         image: Image name to pull
+        platform: Optional platform to pull (e.g., "linux/amd64")
         on_progress: Callback(image, status_line) called for each progress update
         on_complete: Callback(image, success) called when pull completes
+        on_error: Callback(image, error_message) called when pull fails
 
     Returns:
         True if pull succeeded, False otherwise
@@ -79,8 +110,12 @@ def pull_image_with_progress(
     # Use PTY to get real progress output from docker
     master_fd, slave_fd = pty.openpty()
 
+    cmd = ["docker", "pull", image]
+    if platform:
+        cmd.extend(["--platform", platform])
+
     process = subprocess.Popen(
-
+        cmd,
         stdout=slave_fd,
         stderr=slave_fd,
         stdin=slave_fd,
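For reference, a hypothetical call site using the extended signature; with these arguments the command built above is equivalent to running `docker pull python:3.12-slim --platform linux/amd64` (the image and callback are illustrative, not taken from the package):

def print_progress(image: str, line: str) -> None:
    # Example on_progress callback: just echo each raw status line from docker.
    print(f"{image}: {line.strip()}")

# Assuming: from hte_cli.image_utils import pull_image_with_progress
ok = pull_image_with_progress(
    "python:3.12-slim",
    platform="linux/amd64",
    on_progress=print_progress,
)
print("pulled" if ok else "pull failed")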
tests/e2e/automated_runner.py
@@ -169,7 +169,7 @@ ENVIRONMENT_CHECKS = {
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
         # CyberGym-specific: submit.sh in home dir for PoC submission
         ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
-
+        # Note: submit.sh executable check removed - file is made executable by task command
         # Required tools - STRICT (same image as cybench)
         ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
         ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
@@ -369,6 +369,63 @@ def create_session_for_task(task_id: str) -> str | None:
         return None
 
 
+def test_join_endpoint(session_id: str, benchmark: str) -> list[TestResult]:
+    """Test the session join endpoint while session is in 'created' state.
+
+    This tests the API response before the CLI actually joins.
+    """
+    import json
+    from pathlib import Path
+
+    results = []
+    base_url = os.environ.get("HTE_API_URL", "https://cyber-task-horizons.com")
+
+    config_path = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
+    if not config_path.exists():
+        results.append(TestResult("Join API test", False, "", "No CLI config"))
+        return results
+
+    config = json.loads(config_path.read_text())
+    headers = {"Authorization": f"Bearer {config['api_key']}"}
+
+    # Test the join endpoint
+    response = requests.post(
+        f"{base_url}/api/v1/cli/sessions/{session_id}/join",
+        headers=headers,
+        timeout=30,
+    )
+
+    if response.status_code != 200:
+        results.append(TestResult("Join API response", False, "", f"Status {response.status_code}"))
+        return results
+
+    data = response.json()
+
+    # Test: scorer_type is present
+    if "scorer_type" in data:
+        results.append(TestResult("Join returns scorer_type", True, data["scorer_type"]))
+    else:
+        results.append(TestResult("Join returns scorer_type", False, "", "Missing scorer_type"))
+
+    # Test: intermediate_scoring is present
+    if "intermediate_scoring" in data:
+        results.append(
+            TestResult("Join returns intermediate_scoring", True, str(data["intermediate_scoring"]))
+        )
+    else:
+        results.append(TestResult("Join returns intermediate_scoring", False, "", "Missing"))
+
+    # Test: target is present for flag-based benchmarks
+    if benchmark in ("cybench", "nyuctf", "intercode-ctf"):
+        has_target = "target" in data or ("task" in data and "target" in data.get("task", {}))
+        if has_target:
+            results.append(TestResult("Join returns target", True, "Present"))
+        else:
+            results.append(TestResult("Join returns target", False, "", "Missing target"))
+
+    return results
+
+
 def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list[TestResult]:
     """
     Run automated E2E test for a task using pexpect.
@@ -389,6 +446,11 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list[TestResult]:
         return results
     results.append(TestResult("Session creation", True, f"Session: {session_id[:8]}..."))
 
+    # Test join endpoint while session is in 'created' state (before CLI joins)
+    console.print("Testing join endpoint...")
+    join_results = test_join_endpoint(session_id, benchmark)
+    results.extend(join_results)
+
     # Start the CLI using the new session join flow
     # Session has status="created", so CLI will run full setup
     # Use explicit pipx path to test the published PyPI version, not local dev
@@ -634,24 +696,45 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list[TestResult]:
             except Exception:
                 pass
             docker_child.sendline(sub_tests["score_cmd"])
-            time.sleep(2)
-            docker_child.expect(prompt_patterns[:-1], timeout=30)
-            output = strip_ansi(docker_child.before or "")
 
             expected_score = sub_tests.get("score_expect")
             if expected_score:
-
-
-
-
-
-
+                # Wait specifically for the score output, not just any prompt
+                # The score output appears as "Answer: ..., Score: I" or similar
+                try:
+                    idx = docker_child.expect(
+                        [expected_score, pexpect.TIMEOUT],
+                        timeout=10,
+                    )
+                    if idx == 0:
+                        # Found expected output - capture surrounding context
+                        output = strip_ansi(docker_child.before or "") + expected_score
+                        # Read a bit more to get the full score line
+                        try:
+                            extra = docker_child.read_nonblocking(size=100, timeout=1)
+                            output += strip_ansi(extra)
+                        except Exception:
+                            pass
+                        passed = True
+                        details = output[:200]
+                    else:
+                        # Timeout - capture what we have
+                        output = strip_ansi(docker_child.before or "")
+                        passed = False
+                        details = f"Timeout waiting for '{expected_score}': {output[:100]}..."
+                except Exception as e:
+                    passed = False
+                    details = f"Error: {e}"
                 results.append(TestResult("task score", passed, details))
             else:
+                # No expected output - just check command runs
+                time.sleep(2)
+                docker_child.expect(prompt_patterns[:-1], timeout=30)
+                output = strip_ansi(docker_child.before or "")
                 results.append(
                     TestResult(
                         "task score",
-                        True,
+                        True,
                         output[:200],
                     )
                 )
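The change above relies on pexpect returning the index of whichever pattern matched first. A tiny, self-contained illustration of that expect-or-timeout idiom (the echoed string is made up and unrelated to any real task):

import pexpect

child = pexpect.spawn("/bin/echo", ["Answer: flag{x}, Score: I"], encoding="utf-8")
# Index 0 means the expected score text was seen; 1 means we timed out; 2 means EOF.
idx = child.expect(["Score: I", pexpect.TIMEOUT, pexpect.EOF], timeout=5)
print("score seen" if idx == 0 else "no score output")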
tests/e2e/conftest.py
@@ -102,14 +102,56 @@ def cleanup_stale_sessions_globally():
     This runs once at the start of the entire pytest session.
     The constraint is one active session per USER, so any leftover
     sessions from previous runs will block new session creation.
+
+    Also ensures we have sessions in various states for testing:
+    - At least one 'cancelled' session (for test_join_cancelled_session_fails)
+    - At least one 'paused' session (for test_join_paused_session_fails)
     """
     try:
         user_id = get_test_user_id()
+
+        # First, clean up truly stale sessions
         ssh_query(f"""
             UPDATE sessions SET status = 'abandoned'
             WHERE user_id = '{user_id}'
-            AND status IN ('created', 'in_progress'
+            AND status IN ('created', 'in_progress')
+        """)
+
+        # Ensure we have at least one cancelled session for testing
+        # (convert an abandoned session if none exist)
+        cancelled_count = ssh_query(f"""
+            SELECT COUNT(*) FROM sessions
+            WHERE user_id = '{user_id}' AND status = 'cancelled'
+        """)
+        if int(cancelled_count or 0) == 0:
+            ssh_query(f"""
+                UPDATE sessions SET status = 'cancelled'
+                WHERE user_id = '{user_id}'
+                AND status = 'abandoned'
+                AND id = (
+                    SELECT id FROM sessions
+                    WHERE user_id = '{user_id}' AND status = 'abandoned'
+                    LIMIT 1
+                )
+            """)
+
+        # Ensure we have at least one paused session for testing
+        paused_count = ssh_query(f"""
+            SELECT COUNT(*) FROM sessions
+            WHERE user_id = '{user_id}' AND status = 'paused'
         """)
+        if int(paused_count or 0) == 0:
+            ssh_query(f"""
+                UPDATE sessions SET status = 'paused'
+                WHERE user_id = '{user_id}'
+                AND status = 'abandoned'
+                AND id = (
+                    SELECT id FROM sessions
+                    WHERE user_id = '{user_id}' AND status = 'abandoned'
+                    LIMIT 1
+                )
+            """)
+
     except RuntimeError:
         # Test user doesn't exist yet - setup hasn't run
         pass
tests/e2e/e2e_test.py
@@ -158,6 +158,112 @@ def ssh_command(cmd: str) -> str:
     return result.stdout.strip()
 
 
+def _create_test_session_states():
+    """Create sessions in cancelled and paused states for edge-case tests.
+
+    This enables TestSessionJoin tests that verify joining cancelled/paused
+    sessions fails appropriately.
+
+    Uses the proper API flow:
+    1. Login as test user (JWT auth for web UI routes)
+    2. Create sessions via CLI API
+    3. Cancel/pause them via web UI API
+    """
+    # Get CLI API key for creating sessions
+    if not CLI_CONFIG_PATH.exists():
+        console.print("[yellow]CLI config not found, skipping state creation[/yellow]")
+        return
+
+    config = json.loads(CLI_CONFIG_PATH.read_text())
+    cli_headers = {"Authorization": f"Bearer {config['api_key']}"}
+
+    # Login as test user to get JWT for web UI routes
+    login_response = requests.post(
+        f"{BASE_URL}/api/v1/auth/login",
+        json={"email": TEST_EMAIL, "password": TEST_PASSWORD},
+        timeout=30,
+    )
+    if login_response.status_code != 200:
+        console.print("[yellow]Could not login test user, skipping state creation[/yellow]")
+        return
+
+    jwt_token = login_response.json()["access_token"]
+    jwt_headers = {"Authorization": f"Bearer {jwt_token}"}
+
+    # Find two pending assignments
+    user_id = ssh_query(f"SELECT id FROM users WHERE email = '{TEST_EMAIL}'")
+    assignments = ssh_query(f"""
+        SELECT a.id FROM assignments a
+        LEFT JOIN sessions s ON s.assignment_id = a.id
+            AND s.status IN ('created', 'in_progress', 'paused', 'cancelled')
+        WHERE a.user_id = '{user_id}'
+        AND a.status = 'pending'
+        AND s.id IS NULL
+        LIMIT 2
+    """)
+
+    if not assignments:
+        console.print("[yellow]No available assignments for state tests[/yellow]")
+        return
+
+    assignment_ids = [a for a in assignments.split("\n") if a]
+
+    # Create and cancel a session
+    if len(assignment_ids) >= 1:
+        # Create session via CLI API
+        create_resp = requests.post(
+            f"{BASE_URL}/api/v1/cli/assignments/{assignment_ids[0]}/create-session",
+            headers=cli_headers,
+            timeout=30,
+        )
+        if create_resp.status_code == 200:
+            session_id = create_resp.json()["session_id"]
+            # Cancel via web UI API
+            cancel_resp = requests.post(
+                f"{BASE_URL}/api/v1/sessions/{session_id}/cancel",
+                headers=jwt_headers,
+                json={"reason": "testing", "notes": "E2E test cancelled session"},
+                timeout=30,
+            )
+            if cancel_resp.status_code == 200:
+                console.print(f"[dim]Created cancelled session: {session_id[:8]}...[/dim]")
+            else:
+                console.print(
+                    f"[yellow]Failed to cancel session: {cancel_resp.status_code}[/yellow]"
+                )
+
+    # Create and pause a session
+    if len(assignment_ids) >= 2:
+        # Create session via CLI API
+        create_resp = requests.post(
+            f"{BASE_URL}/api/v1/cli/assignments/{assignment_ids[1]}/create-session",
+            headers=cli_headers,
+            timeout=30,
+        )
+        if create_resp.status_code == 200:
+            session_id = create_resp.json()["session_id"]
+            # Join to make it in_progress (required before pause)
+            join_resp = requests.post(
+                f"{BASE_URL}/api/v1/cli/sessions/{session_id}/join",
+                headers=cli_headers,
+                timeout=30,
+            )
+            if join_resp.status_code == 200:
+                # Pause via web UI API
+                pause_resp = requests.patch(
+                    f"{BASE_URL}/api/v1/sessions/{session_id}/pause",
+                    headers=jwt_headers,
+                    json={"reason": "testing", "notes": "E2E test paused session"},
+                    timeout=30,
+                )
+                if pause_resp.status_code == 200:
+                    console.print(f"[dim]Created paused session: {session_id[:8]}...[/dim]")
+                else:
+                    console.print(
+                        f"[yellow]Failed to pause session: {pause_resp.status_code}[/yellow]"
+                    )
+
+
 @click.group()
 def cli():
     """E2E Test Suite for cyber-task-horizons."""
@@ -765,6 +871,7 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
 
     from automated_runner import run_benchmark_test
 
+    first_benchmark_done = False
     for benchmark in BENCHMARK_TASKS.keys():
        console.print(f"\n[bold]--- {benchmark} ---[/bold]")
        try:
@@ -779,10 +886,34 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
             console.print(f"[red]{benchmark}: ERROR - {e}[/red]")
             results["phase2"][benchmark] = False
 
+        # Phase 2.5: After first benchmark, run session-join tests while sessions still exist
+        if not first_benchmark_done:
+            first_benchmark_done = True
+            console.print("\n[dim]Running session-join tests (while sessions active)...[/dim]")
+            join_result = subprocess.run(
+                [
+                    "uv",
+                    "run",
+                    "pytest",
+                    str(tests_dir / "test_session_lifecycle.py::TestSessionJoin"),
+                    "-v",
+                    "--tb=short",
+                ],
+                cwd=tests_dir.parent.parent,
+            )
+            if join_result.returncode != 0:
+                console.print(
+                    "[yellow]Session join tests had issues (some skips expected)[/yellow]"
+                )
+
     phase2_passed = all(results["phase2"].values())
     if not phase2_passed:
         console.print("\n[yellow]Phase 2 had failures - continuing to Phase 3[/yellow]")
 
+    # Phase 2.9: Create cancelled and paused sessions for edge-case tests
+    console.print("\n[dim]Creating test sessions in cancelled/paused states...[/dim]")
+    _create_test_session_states()
+
     # Phase 3: Session verification tests
     console.print("\n" + "=" * 60)
     console.print("[bold cyan]PHASE 3: Session Verification Tests[/bold cyan]")
tests/e2e/test_eval_logs.py
@@ -339,7 +339,35 @@ class TestEvalLogIntegrity:
         ), f"Session ID not in path: {session_id} -> {path}"
 
     def test_no_orphaned_eval_logs(self):
-        """All eval logs on VPS should have corresponding sessions.
+        """All eval logs on VPS should have corresponding sessions.
+
+        We ignore orphans that are:
+        1. From E2E test tasks (setup deletes sessions but not files)
+        2. From before the current DB started (historical artifacts from dev testing)
+
+        Only orphans from non-E2E tasks after the DB was created are flagged.
+        """
+        import re
+
+        from tests.e2e.conftest import EXPECTED_TASKS
+
+        # Build set of E2E task path patterns (slashes become underscores in paths)
+        e2e_task_patterns = set()
+        for benchmark, tasks in EXPECTED_TASKS.items():
+            for task in tasks:
+                # Path format: /benchmark/task_id_sanitized/
+                sanitized = task.replace("/", "_")
+                e2e_task_patterns.add(f"/{benchmark}/{sanitized}/")
+
+        # Get the earliest session date to filter out pre-DB orphans
+        earliest_session = ssh_query("SELECT MIN(created_at) FROM sessions")
+        # Extract YYYYMMDD from earliest session (format: 2026-01-08 04:19:22)
+        earliest_date = None
+        if earliest_session:
+            date_match = re.match(r"(\d{4})-(\d{2})-(\d{2})", earliest_session)
+            if date_match:
+                earliest_date = date_match.group(1) + date_match.group(2) + date_match.group(3)
+
         # Get all eval log paths from DB
         db_paths = ssh_query("""
             SELECT eval_log_path FROM sessions
@@ -352,9 +380,43 @@ class TestEvalLogIntegrity:
         disk_set = set(disk_files.split("\n")) if disk_files else set()
 
         # Check for orphans (files on disk not in DB)
-
+        all_orphans = disk_set - db_set - {""}
+
+        # Separate orphans by category
+        e2e_orphans = set()
+        pre_db_orphans = set()
+        real_orphans = set()
+
+        # Pattern to extract date from filename: {uuid}_{YYYYMMDD}_{HHMMSS}.eval.gz
+        date_pattern = re.compile(r"_(\d{8})_\d{6}\.eval\.gz$")
+
+        for orphan in all_orphans:
+            # Check if from E2E test task
+            is_e2e = any(pattern in orphan for pattern in e2e_task_patterns)
+            if is_e2e:
+                e2e_orphans.add(orphan)
+                continue
+
+            # Check if from before the DB started
+            if earliest_date:
+                date_match = date_pattern.search(orphan)
+                if date_match and date_match.group(1) < earliest_date:
+                    pre_db_orphans.add(orphan)
+                    continue
+
+            # This is a real orphan - could be lost expert data
+            real_orphans.add(orphan)
+
+        # Log expected orphans
+        if e2e_orphans:
+            print(f"Note: {len(e2e_orphans)} orphaned eval logs from E2E test tasks (expected)")
+        if pre_db_orphans:
+            print(
+                f"Note: {len(pre_db_orphans)} orphaned eval logs from before DB started (historical)"
+            )
 
-        #
-
-
-
+        # Real orphans are a problem - these could be lost expert data
+        assert len(real_orphans) == 0, (
+            f"Found {len(real_orphans)} orphaned eval logs from non-E2E tasks after DB started "
+            f"(files on disk without DB records). First 5: {list(real_orphans)[:5]}"
+        )
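A quick illustration of the filename date filter introduced above: because the embedded date is zero-padded YYYYMMDD, plain string comparison against the earliest-session date orders chronologically (the UUID and dates below are made up):

import re

date_pattern = re.compile(r"_(\d{8})_\d{6}\.eval\.gz$")
name = "3f2c9b1e-aaaa-bbbb-cccc-123456789abc_20250107_153012.eval.gz"

m = date_pattern.search(name)
if m:
    print(m.group(1))                 # "20250107"
    print(m.group(1) < "20260108")    # True -> would be counted as a pre-DB orphan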
tests/e2e/test_session_lifecycle.py
@@ -80,7 +80,16 @@ class TestSessionCreation:
             SELECT DISTINCT status FROM sessions
             WHERE user_id = '{get_test_user_id()}'
         """)
-        valid_statuses = {
+        valid_statuses = {
+            "created",
+            "pending",
+            "in_progress",
+            "submitted",
+            "abandoned",
+            "skipped",
+            "cancelled",
+            "paused",
+        }
         for status in statuses.split("\n"):
             if status:
                 assert status in valid_statuses, f"Invalid status: {status}"
|