hte-cli 0.2.24__tar.gz → 0.2.25__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {hte_cli-0.2.24 → hte_cli-0.2.25}/.gitignore +1 -0
  2. {hte_cli-0.2.24 → hte_cli-0.2.25}/PKG-INFO +1 -1
  3. {hte_cli-0.2.24 → hte_cli-0.2.25}/pyproject.toml +1 -1
  4. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/cli.py +53 -7
  5. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/image_utils.py +36 -1
  6. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/automated_runner.py +94 -11
  7. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/conftest.py +43 -1
  8. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/e2e_test.py +131 -0
  9. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/test_eval_logs.py +68 -6
  10. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/test_session_lifecycle.py +10 -1
  11. {hte_cli-0.2.24 → hte_cli-0.2.25}/uv.lock +1 -1
  12. {hte_cli-0.2.24 → hte_cli-0.2.25}/README.md +0 -0
  13. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/__init__.py +0 -0
  14. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/__main__.py +0 -0
  15. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/api_client.py +0 -0
  16. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/config.py +0 -0
  17. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/errors.py +0 -0
  18. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/events.py +0 -0
  19. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/runner.py +0 -0
  20. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/scorers.py +0 -0
  21. {hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/version_check.py +0 -0
  22. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/__init__.py +0 -0
  23. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/__init__.py +0 -0
  24. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/test_benchmark_flows.py +0 -0
  25. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/test_infrastructure.py +0 -0
  26. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/test_runtime_imports.py +0 -0
  27. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/verify_docker_deps.py +0 -0
  28. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/unit/__init__.py +0 -0
  29. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/unit/conftest.py +0 -0
  30. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/unit/test_image_utils.py +0 -0
  31. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.24 → hte_cli-0.2.25}/tests/unit/test_scorers.py +0 -0
{hte_cli-0.2.24 → hte_cli-0.2.25}/.gitignore
@@ -1,4 +1,5 @@
 .env
+.envrc
 .DS_Store
 
 docs/build/
{hte_cli-0.2.24 → hte_cli-0.2.25}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hte-cli
-Version: 0.2.24
+Version: 0.2.25
 Summary: Human Time-to-Completion Evaluation CLI
 Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
 Author: Lyptus Research
{hte_cli-0.2.24 → hte_cli-0.2.25}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hte-cli"
-version = "0.2.24"
+version = "0.2.25"
 description = "Human Time-to-Completion Evaluation CLI"
 readme = "README.md"
 requires-python = ">=3.11"
{hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/cli.py
@@ -172,6 +172,17 @@ def session_join(ctx, session_id: str, force_setup: bool):
        console.print("[red]Not logged in. Run: hte-cli auth login[/red]")
        sys.exit(1)
 
+    # Check Docker is running before we start (with retry prompt)
+    while True:
+        docker_ok, docker_error = _check_docker()
+        if docker_ok:
+            break
+        console.print(f"[red]{docker_error}[/red]")
+        console.print()
+        if not click.confirm("Start Docker and retry?", default=True):
+            sys.exit(1)
+        console.print("[dim]Checking Docker again...[/dim]")
+
    api = APIClient(config)
 
    # Step 1: Join session
@@ -201,8 +212,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
    # Check if reconnecting (session already in_progress)
    is_reconnect = session_info.get("status") == "in_progress"
 
-    if is_reconnect and not force_setup:
-        console.print("[yellow]Reconnecting to existing session...[/yellow]")
+    # Always run setup on reconnect - previous attempt may have failed
+    # (e.g., image pull failed, Docker wasn't running, etc.)
+    if is_reconnect:
+        force_setup = True
+        console.print("[yellow]Reconnecting to existing session (re-running setup)...[/yellow]")
        console.print()
 
    console.print(
@@ -219,7 +233,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
    import time
    from hte_cli.events import EventStreamer
    from hte_cli.runner import TaskRunner
-    from hte_cli.image_utils import extract_images_from_compose, pull_image_with_progress
+    from hte_cli.image_utils import (
+        extract_images_from_compose,
+        extract_image_platforms_from_compose,
+        pull_image_with_progress,
+    )
 
    # Create event streamer
    events = EventStreamer(api, session_id)
@@ -285,9 +303,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
    failed_images = []
 
    if not is_reconnect or force_setup:
-        # Extract images from compose
+        # Extract images and their platforms from compose
+        image_platforms = {}
        if compose_yaml:
            images = extract_images_from_compose(compose_yaml)
+            image_platforms = extract_image_platforms_from_compose(compose_yaml)
 
        # Send setup_started event (includes CLI version for debugging)
        events.setup_started(images=images, cli_version=__version__)
@@ -298,9 +318,11 @@ def session_join(ctx, session_id: str, force_setup: bool):
 
        console.print(f"[bold]Step 2:[/bold] Pulling {len(images)} Docker image(s)...")
        pull_start = time.monotonic()
+        pull_errors = {}
 
        for img in images:
            short_name = img.split("/")[-1][:40]
+            platform = image_platforms.get(img)
 
            # Check if already cached
            if check_image_exists_locally(img):
@@ -310,6 +332,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
 
            # Need to pull - show progress
            last_status = ["connecting..."]
+            last_error = [""]
            with console.status(
                f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
            ) as status:
@@ -328,14 +351,23 @@ def session_join(ctx, session_id: str, force_setup: bool):
                    status.update(
                        f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]"
                    )
+                    # Capture error messages
+                    if "error" in line.lower() or "denied" in line.lower():
+                        last_error[0] = line
 
-            success = pull_image_with_progress(img, on_progress=show_progress)
+            success = pull_image_with_progress(
+                img, platform=platform, on_progress=show_progress
+            )
 
            if success:
                console.print(f" [green]✓[/green] {short_name} [dim](downloaded)[/dim]")
                pulled_images.append(img)
            else:
-                console.print(f" [red]✗[/red] {short_name} [dim](failed)[/dim]")
+                platform_note = f" (platform: {platform})" if platform else ""
+                console.print(f" [red]✗[/red] {short_name}{platform_note} [dim](failed)[/dim]")
+                if last_error[0]:
+                    console.print(f" [dim]{last_error[0][:60]}[/dim]")
+                pull_errors[img] = last_error[0]
                failed_images.append(img)
 
        pull_duration = time.monotonic() - pull_start
@@ -347,6 +379,20 @@ def session_join(ctx, session_id: str, force_setup: bool):
        )
        console.print()
 
+    # Fail fast if any required image couldn't be pulled
+    if failed_images:
+        console.print(
+            f"[red]Error: Failed to pull {len(failed_images)} required Docker image(s).[/red]"
+        )
+        console.print()
+        console.print("[yellow]Troubleshooting:[/yellow]")
+        console.print(" 1. Check Docker is running: docker info")
+        console.print(" 2. Try manual pull: docker pull python:3.12-slim --platform linux/amd64")
+        console.print(" 3. Check network connectivity")
+        console.print()
+        console.print("Session remains active - you can retry with: hte-cli session join " + session_id)
+        sys.exit(1)
+
    # Send setup_completed - THIS STARTS THE TIMER ON SERVER
    total_setup = time.monotonic() - setup_start_time
    events.setup_completed(total_seconds=total_setup)
@@ -644,7 +690,7 @@ def _check_docker() -> tuple[bool, str | None]:
            timeout=10,
        )
        if result.returncode != 0:
-            return False, "Docker is not running. Start Docker Desktop or the Docker daemon."
+            return False, "Docker is not running. Start Docker (Docker Desktop, colima, or dockerd)."
    except FileNotFoundError:
        return False, "Docker is not installed. Install from https://docs.docker.com/get-docker/"
    except Exception as e:
{hte_cli-0.2.24 → hte_cli-0.2.25}/src/hte_cli/image_utils.py
@@ -38,6 +38,33 @@ def extract_images_from_compose(compose_yaml: str) -> list[str]:
        return []
 
 
+def extract_image_platforms_from_compose(compose_yaml: str) -> dict[str, str | None]:
+    """
+    Extract Docker image names and their platforms from a compose.yaml string.
+
+    Args:
+        compose_yaml: Docker Compose YAML content
+
+    Returns:
+        Dict mapping image names to their platform (or None if no platform specified)
+    """
+    try:
+        compose_data = yaml.safe_load(compose_yaml)
+        if not compose_data or "services" not in compose_data:
+            return {}
+
+        image_platforms = {}
+        for service_name, service_config in compose_data.get("services", {}).items():
+            if isinstance(service_config, dict) and "image" in service_config:
+                image = service_config["image"]
+                platform = service_config.get("platform")
+                image_platforms[image] = platform
+        return image_platforms
+    except yaml.YAMLError as e:
+        logger.warning(f"Failed to parse compose.yaml: {e}")
+        return {}
+
+
 def check_image_exists_locally(image: str) -> bool:
    """
    Check if a Docker image exists locally.
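A minimal sketch of what the new helper returns, using hypothetical compose content (not from the package):

    from hte_cli.image_utils import extract_image_platforms_from_compose

    compose_yaml = """
    services:
      web:
        image: python:3.12-slim
        platform: linux/amd64
      db:
        image: postgres:16
    """

    # Services without an explicit platform map to None
    print(extract_image_platforms_from_compose(compose_yaml))
    # {'python:3.12-slim': 'linux/amd64', 'postgres:16': None}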
@@ -61,16 +88,20 @@ def check_image_exists_locally(image: str) -> bool:
 
 def pull_image_with_progress(
    image: str,
+    platform: str | None = None,
    on_progress: Callable[[str, str], None] | None = None,
    on_complete: Callable[[str, bool], None] | None = None,
+    on_error: Callable[[str, str], None] | None = None,
 ) -> bool:
    """
    Pull a Docker image with progress callbacks using PTY for real progress output.
 
    Args:
        image: Image name to pull
+        platform: Optional platform to pull (e.g., "linux/amd64")
        on_progress: Callback(image, status_line) called for each progress update
        on_complete: Callback(image, success) called when pull completes
+        on_error: Callback(image, error_message) called when pull fails
 
    Returns:
        True if pull succeeded, False otherwise
@@ -79,8 +110,12 @@ def pull_image_with_progress(
    # Use PTY to get real progress output from docker
    master_fd, slave_fd = pty.openpty()
 
+    cmd = ["docker", "pull", image]
+    if platform:
+        cmd.extend(["--platform", platform])
+
    process = subprocess.Popen(
-        ["docker", "pull", image],
+        cmd,
        stdout=slave_fd,
        stderr=slave_fd,
        stdin=slave_fd,
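When a platform is set, the pull now runs the equivalent of the following sketch (the values are hypothetical, and the real call goes through subprocess.Popen with a PTY so Docker emits live progress lines):

    import subprocess

    image, platform = "python:3.12-slim", "linux/amd64"  # hypothetical values

    cmd = ["docker", "pull", image]
    if platform:
        cmd.extend(["--platform", platform])
    # Equivalent to: docker pull python:3.12-slim --platform linux/amd64
    subprocess.run(cmd, check=True)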
{hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/automated_runner.py
@@ -169,7 +169,7 @@ ENVIRONMENT_CHECKS = {
    ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
    # CyberGym-specific: submit.sh in home dir for PoC submission
    ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
-    ("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
+    # Note: submit.sh executable check removed - file is made executable by task command
    # Required tools - STRICT (same image as cybench)
    ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
    ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
@@ -369,6 +369,63 @@ def create_session_for_task(task_id: str) -> str | None:
        return None
 
 
+def test_join_endpoint(session_id: str, benchmark: str) -> list[TestResult]:
+    """Test the session join endpoint while session is in 'created' state.
+
+    This tests the API response before the CLI actually joins.
+    """
+    import json
+    from pathlib import Path
+
+    results = []
+    base_url = os.environ.get("HTE_API_URL", "https://cyber-task-horizons.com")
+
+    config_path = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
+    if not config_path.exists():
+        results.append(TestResult("Join API test", False, "", "No CLI config"))
+        return results
+
+    config = json.loads(config_path.read_text())
+    headers = {"Authorization": f"Bearer {config['api_key']}"}
+
+    # Test the join endpoint
+    response = requests.post(
+        f"{base_url}/api/v1/cli/sessions/{session_id}/join",
+        headers=headers,
+        timeout=30,
+    )
+
+    if response.status_code != 200:
+        results.append(TestResult("Join API response", False, "", f"Status {response.status_code}"))
+        return results
+
+    data = response.json()
+
+    # Test: scorer_type is present
+    if "scorer_type" in data:
+        results.append(TestResult("Join returns scorer_type", True, data["scorer_type"]))
+    else:
+        results.append(TestResult("Join returns scorer_type", False, "", "Missing scorer_type"))
+
+    # Test: intermediate_scoring is present
+    if "intermediate_scoring" in data:
+        results.append(
+            TestResult("Join returns intermediate_scoring", True, str(data["intermediate_scoring"]))
+        )
+    else:
+        results.append(TestResult("Join returns intermediate_scoring", False, "", "Missing"))
+
+    # Test: target is present for flag-based benchmarks
+    if benchmark in ("cybench", "nyuctf", "intercode-ctf"):
+        has_target = "target" in data or ("task" in data and "target" in data.get("task", {}))
+        if has_target:
+            results.append(TestResult("Join returns target", True, "Present"))
+        else:
+            results.append(TestResult("Join returns target", False, "", "Missing target"))
+
+    return results
+
+
 def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list[TestResult]:
    """
    Run automated E2E test for a task using pexpect.
@@ -389,6 +446,11 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list[TestResult]:
        return results
    results.append(TestResult("Session creation", True, f"Session: {session_id[:8]}..."))
 
+    # Test join endpoint while session is in 'created' state (before CLI joins)
+    console.print("Testing join endpoint...")
+    join_results = test_join_endpoint(session_id, benchmark)
+    results.extend(join_results)
+
    # Start the CLI using the new session join flow
    # Session has status="created", so CLI will run full setup
    # Use explicit pipx path to test the published PyPI version, not local dev
@@ -634,24 +696,45 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list[TestResult]:
            except Exception:
                pass
            docker_child.sendline(sub_tests["score_cmd"])
-            time.sleep(2)
-            docker_child.expect(prompt_patterns[:-1], timeout=30)
-            output = strip_ansi(docker_child.before or "")
 
            expected_score = sub_tests.get("score_expect")
            if expected_score:
-                passed = expected_score.lower() in output.lower()
-                details = (
-                    output[:200]
-                    if passed
-                    else f"Expected '{expected_score}' in output: {output[:100]}..."
-                )
+                # Wait specifically for the score output, not just any prompt
+                # The score output appears as "Answer: ..., Score: I" or similar
+                try:
+                    idx = docker_child.expect(
+                        [expected_score, pexpect.TIMEOUT],
+                        timeout=10,
+                    )
+                    if idx == 0:
+                        # Found expected output - capture surrounding context
+                        output = strip_ansi(docker_child.before or "") + expected_score
+                        # Read a bit more to get the full score line
+                        try:
+                            extra = docker_child.read_nonblocking(size=100, timeout=1)
+                            output += strip_ansi(extra)
+                        except Exception:
+                            pass
+                        passed = True
+                        details = output[:200]
+                    else:
+                        # Timeout - capture what we have
+                        output = strip_ansi(docker_child.before or "")
+                        passed = False
+                        details = f"Timeout waiting for '{expected_score}': {output[:100]}..."
+                except Exception as e:
+                    passed = False
+                    details = f"Error: {e}"
                results.append(TestResult("task score", passed, details))
            else:
+                # No expected output - just check command runs
+                time.sleep(2)
+                docker_child.expect(prompt_patterns[:-1], timeout=30)
+                output = strip_ansi(docker_child.before or "")
                results.append(
                    TestResult(
                        "task score",
-                        True,  # Just checking it runs
+                        True,
                        output[:200],
                    )
                )
{hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/conftest.py
@@ -102,14 +102,56 @@ def cleanup_stale_sessions_globally():
    This runs once at the start of the entire pytest session.
    The constraint is one active session per USER, so any leftover
    sessions from previous runs will block new session creation.
+
+    Also ensures we have sessions in various states for testing:
+    - At least one 'cancelled' session (for test_join_cancelled_session_fails)
+    - At least one 'paused' session (for test_join_paused_session_fails)
    """
    try:
        user_id = get_test_user_id()
+
+        # First, clean up truly stale sessions
        ssh_query(f"""
            UPDATE sessions SET status = 'abandoned'
            WHERE user_id = '{user_id}'
-            AND status IN ('created', 'in_progress', 'paused')
+            AND status IN ('created', 'in_progress')
+        """)
+
+        # Ensure we have at least one cancelled session for testing
+        # (convert an abandoned session if none exist)
+        cancelled_count = ssh_query(f"""
+            SELECT COUNT(*) FROM sessions
+            WHERE user_id = '{user_id}' AND status = 'cancelled'
+        """)
+        if int(cancelled_count or 0) == 0:
+            ssh_query(f"""
+                UPDATE sessions SET status = 'cancelled'
+                WHERE user_id = '{user_id}'
+                AND status = 'abandoned'
+                AND id = (
+                    SELECT id FROM sessions
+                    WHERE user_id = '{user_id}' AND status = 'abandoned'
+                    LIMIT 1
+                )
+            """)
+
+        # Ensure we have at least one paused session for testing
+        paused_count = ssh_query(f"""
+            SELECT COUNT(*) FROM sessions
+            WHERE user_id = '{user_id}' AND status = 'paused'
        """)
+        if int(paused_count or 0) == 0:
+            ssh_query(f"""
+                UPDATE sessions SET status = 'paused'
+                WHERE user_id = '{user_id}'
+                AND status = 'abandoned'
+                AND id = (
+                    SELECT id FROM sessions
+                    WHERE user_id = '{user_id}' AND status = 'abandoned'
+                    LIMIT 1
+                )
+            """)
+
    except RuntimeError:
        # Test user doesn't exist yet - setup hasn't run
        pass
{hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/e2e_test.py
@@ -158,6 +158,112 @@ def ssh_command(cmd: str) -> str:
    return result.stdout.strip()
 
 
+def _create_test_session_states():
+    """Create sessions in cancelled and paused states for edge-case tests.
+
+    This enables TestSessionJoin tests that verify joining cancelled/paused
+    sessions fails appropriately.
+
+    Uses the proper API flow:
+    1. Login as test user (JWT auth for web UI routes)
+    2. Create sessions via CLI API
+    3. Cancel/pause them via web UI API
+    """
+    # Get CLI API key for creating sessions
+    if not CLI_CONFIG_PATH.exists():
+        console.print("[yellow]CLI config not found, skipping state creation[/yellow]")
+        return
+
+    config = json.loads(CLI_CONFIG_PATH.read_text())
+    cli_headers = {"Authorization": f"Bearer {config['api_key']}"}
+
+    # Login as test user to get JWT for web UI routes
+    login_response = requests.post(
+        f"{BASE_URL}/api/v1/auth/login",
+        json={"email": TEST_EMAIL, "password": TEST_PASSWORD},
+        timeout=30,
+    )
+    if login_response.status_code != 200:
+        console.print("[yellow]Could not login test user, skipping state creation[/yellow]")
+        return
+
+    jwt_token = login_response.json()["access_token"]
+    jwt_headers = {"Authorization": f"Bearer {jwt_token}"}
+
+    # Find two pending assignments
+    user_id = ssh_query(f"SELECT id FROM users WHERE email = '{TEST_EMAIL}'")
+    assignments = ssh_query(f"""
+        SELECT a.id FROM assignments a
+        LEFT JOIN sessions s ON s.assignment_id = a.id
+            AND s.status IN ('created', 'in_progress', 'paused', 'cancelled')
+        WHERE a.user_id = '{user_id}'
+        AND a.status = 'pending'
+        AND s.id IS NULL
+        LIMIT 2
+    """)
+
+    if not assignments:
+        console.print("[yellow]No available assignments for state tests[/yellow]")
+        return
+
+    assignment_ids = [a for a in assignments.split("\n") if a]
+
+    # Create and cancel a session
+    if len(assignment_ids) >= 1:
+        # Create session via CLI API
+        create_resp = requests.post(
+            f"{BASE_URL}/api/v1/cli/assignments/{assignment_ids[0]}/create-session",
+            headers=cli_headers,
+            timeout=30,
+        )
+        if create_resp.status_code == 200:
+            session_id = create_resp.json()["session_id"]
+            # Cancel via web UI API
+            cancel_resp = requests.post(
+                f"{BASE_URL}/api/v1/sessions/{session_id}/cancel",
+                headers=jwt_headers,
+                json={"reason": "testing", "notes": "E2E test cancelled session"},
+                timeout=30,
+            )
+            if cancel_resp.status_code == 200:
+                console.print(f"[dim]Created cancelled session: {session_id[:8]}...[/dim]")
+            else:
+                console.print(
+                    f"[yellow]Failed to cancel session: {cancel_resp.status_code}[/yellow]"
+                )
+
+    # Create and pause a session
+    if len(assignment_ids) >= 2:
+        # Create session via CLI API
+        create_resp = requests.post(
+            f"{BASE_URL}/api/v1/cli/assignments/{assignment_ids[1]}/create-session",
+            headers=cli_headers,
+            timeout=30,
+        )
+        if create_resp.status_code == 200:
+            session_id = create_resp.json()["session_id"]
+            # Join to make it in_progress (required before pause)
+            join_resp = requests.post(
+                f"{BASE_URL}/api/v1/cli/sessions/{session_id}/join",
+                headers=cli_headers,
+                timeout=30,
+            )
+            if join_resp.status_code == 200:
+                # Pause via web UI API
+                pause_resp = requests.patch(
+                    f"{BASE_URL}/api/v1/sessions/{session_id}/pause",
+                    headers=jwt_headers,
+                    json={"reason": "testing", "notes": "E2E test paused session"},
+                    timeout=30,
+                )
+                if pause_resp.status_code == 200:
+                    console.print(f"[dim]Created paused session: {session_id[:8]}...[/dim]")
+                else:
+                    console.print(
+                        f"[yellow]Failed to pause session: {pause_resp.status_code}[/yellow]"
+                    )
+
+
 @click.group()
 def cli():
    """E2E Test Suite for cyber-task-horizons."""
@@ -765,6 +871,7 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
 
    from automated_runner import run_benchmark_test
 
+    first_benchmark_done = False
    for benchmark in BENCHMARK_TASKS.keys():
        console.print(f"\n[bold]--- {benchmark} ---[/bold]")
        try:
@@ -779,10 +886,34 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):
            console.print(f"[red]{benchmark}: ERROR - {e}[/red]")
            results["phase2"][benchmark] = False
 
+        # Phase 2.5: After first benchmark, run session-join tests while sessions still exist
+        if not first_benchmark_done:
+            first_benchmark_done = True
+            console.print("\n[dim]Running session-join tests (while sessions active)...[/dim]")
+            join_result = subprocess.run(
+                [
+                    "uv",
+                    "run",
+                    "pytest",
+                    str(tests_dir / "test_session_lifecycle.py::TestSessionJoin"),
+                    "-v",
+                    "--tb=short",
+                ],
+                cwd=tests_dir.parent.parent,
+            )
+            if join_result.returncode != 0:
+                console.print(
+                    "[yellow]Session join tests had issues (some skips expected)[/yellow]"
+                )
+
    phase2_passed = all(results["phase2"].values())
    if not phase2_passed:
        console.print("\n[yellow]Phase 2 had failures - continuing to Phase 3[/yellow]")
 
+    # Phase 2.9: Create cancelled and paused sessions for edge-case tests
+    console.print("\n[dim]Creating test sessions in cancelled/paused states...[/dim]")
+    _create_test_session_states()
+
    # Phase 3: Session verification tests
    console.print("\n" + "=" * 60)
    console.print("[bold cyan]PHASE 3: Session Verification Tests[/bold cyan]")
{hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/test_eval_logs.py
@@ -339,7 +339,35 @@ class TestEvalLogIntegrity:
        ), f"Session ID not in path: {session_id} -> {path}"
 
    def test_no_orphaned_eval_logs(self):
-        """All eval logs on VPS should have corresponding sessions."""
+        """All eval logs on VPS should have corresponding sessions.
+
+        We ignore orphans that are:
+        1. From E2E test tasks (setup deletes sessions but not files)
+        2. From before the current DB started (historical artifacts from dev testing)
+
+        Only orphans from non-E2E tasks after the DB was created are flagged.
+        """
+        import re
+
+        from tests.e2e.conftest import EXPECTED_TASKS
+
+        # Build set of E2E task path patterns (slashes become underscores in paths)
+        e2e_task_patterns = set()
+        for benchmark, tasks in EXPECTED_TASKS.items():
+            for task in tasks:
+                # Path format: /benchmark/task_id_sanitized/
+                sanitized = task.replace("/", "_")
+                e2e_task_patterns.add(f"/{benchmark}/{sanitized}/")
+
+        # Get the earliest session date to filter out pre-DB orphans
+        earliest_session = ssh_query("SELECT MIN(created_at) FROM sessions")
+        # Extract YYYYMMDD from earliest session (format: 2026-01-08 04:19:22)
+        earliest_date = None
+        if earliest_session:
+            date_match = re.match(r"(\d{4})-(\d{2})-(\d{2})", earliest_session)
+            if date_match:
+                earliest_date = date_match.group(1) + date_match.group(2) + date_match.group(3)
+
        # Get all eval log paths from DB
        db_paths = ssh_query("""
            SELECT eval_log_path FROM sessions
@@ -352,9 +380,43 @@ class TestEvalLogIntegrity:
        disk_set = set(disk_files.split("\n")) if disk_files else set()
 
        # Check for orphans (files on disk not in DB)
-        orphans = disk_set - db_set - {""}
+        all_orphans = disk_set - db_set - {""}
+
+        # Separate orphans by category
+        e2e_orphans = set()
+        pre_db_orphans = set()
+        real_orphans = set()
+
+        # Pattern to extract date from filename: {uuid}_{YYYYMMDD}_{HHMMSS}.eval.gz
+        date_pattern = re.compile(r"_(\d{8})_\d{6}\.eval\.gz$")
+
+        for orphan in all_orphans:
+            # Check if from E2E test task
+            is_e2e = any(pattern in orphan for pattern in e2e_task_patterns)
+            if is_e2e:
+                e2e_orphans.add(orphan)
+                continue
+
+            # Check if from before the DB started
+            if earliest_date:
+                date_match = date_pattern.search(orphan)
+                if date_match and date_match.group(1) < earliest_date:
+                    pre_db_orphans.add(orphan)
+                    continue
+
+            # This is a real orphan - could be lost expert data
+            real_orphans.add(orphan)
+
+        # Log expected orphans
+        if e2e_orphans:
+            print(f"Note: {len(e2e_orphans)} orphaned eval logs from E2E test tasks (expected)")
+        if pre_db_orphans:
+            print(
+                f"Note: {len(pre_db_orphans)} orphaned eval logs from before DB started (historical)"
+            )
 
-        # Some orphans might be acceptable (old tests, etc.)
-        # Just report, don't fail
-        if orphans:
-            pytest.skip(f"Found {len(orphans)} potentially orphaned eval logs")
+        # Real orphans are a problem - these could be lost expert data
+        assert len(real_orphans) == 0, (
+            f"Found {len(real_orphans)} orphaned eval logs from non-E2E tasks after DB started "
+            f"(files on disk without DB records). First 5: {list(real_orphans)[:5]}"
+        )
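A small sketch of how the new orphan classification behaves, with a made-up path and dates (the filename convention {uuid}_{YYYYMMDD}_{HHMMSS}.eval.gz is taken from the test's own comment):

    import re

    date_pattern = re.compile(r"_(\d{8})_\d{6}\.eval\.gz$")

    orphan = "/cybench/some_task/0a1b2c3d_20251203_141530.eval.gz"  # hypothetical
    earliest_date = "20260108"  # derived from MIN(created_at), e.g. 2026-01-08

    match = date_pattern.search(orphan)
    # Zero-padded YYYYMMDD strings compare correctly as text, so a datestamp
    # before the earliest session marks the file as a historical (pre-DB) orphan
    assert match is not None and match.group(1) < earliest_date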
{hte_cli-0.2.24 → hte_cli-0.2.25}/tests/e2e/test_session_lifecycle.py
@@ -80,7 +80,16 @@ class TestSessionCreation:
            SELECT DISTINCT status FROM sessions
            WHERE user_id = '{get_test_user_id()}'
        """)
-        valid_statuses = {"created", "pending", "in_progress", "submitted", "abandoned", "skipped"}
+        valid_statuses = {
+            "created",
+            "pending",
+            "in_progress",
+            "submitted",
+            "abandoned",
+            "skipped",
+            "cancelled",
+            "paused",
+        }
        for status in statuses.split("\n"):
            if status:
                assert status in valid_statuses, f"Invalid status: {status}"
{hte_cli-0.2.24 → hte_cli-0.2.25}/uv.lock
@@ -625,7 +625,7 @@ wheels = [
 
 [[package]]
 name = "hte-cli"
-version = "0.2.23"
+version = "0.2.24"
 source = { editable = "." }
 dependencies = [
    { name = "click" },