hte-cli 0.2.21__tar.gz → 0.2.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {hte_cli-0.2.21 → hte_cli-0.2.22}/PKG-INFO +1 -1
  2. {hte_cli-0.2.21 → hte_cli-0.2.22}/pyproject.toml +1 -1
  3. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/cli.py +21 -1
  4. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/automated_runner.py +44 -15
  5. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/e2e_test.py +5 -2
  6. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/test_eval_logs.py +35 -15
  7. {hte_cli-0.2.21 → hte_cli-0.2.22}/uv.lock +1 -1
  8. {hte_cli-0.2.21 → hte_cli-0.2.22}/.gitignore +0 -0
  9. {hte_cli-0.2.21 → hte_cli-0.2.22}/README.md +0 -0
  10. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/__init__.py +0 -0
  11. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/__main__.py +0 -0
  12. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/api_client.py +0 -0
  13. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/config.py +0 -0
  14. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/errors.py +0 -0
  15. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/events.py +0 -0
  16. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/image_utils.py +0 -0
  17. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/runner.py +0 -0
  18. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/scorers.py +0 -0
  19. {hte_cli-0.2.21 → hte_cli-0.2.22}/src/hte_cli/version_check.py +0 -0
  20. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/__init__.py +0 -0
  21. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/__init__.py +0 -0
  22. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/conftest.py +0 -0
  23. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/test_benchmark_flows.py +0 -0
  24. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/test_infrastructure.py +0 -0
  25. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/test_runtime_imports.py +0 -0
  26. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/test_session_lifecycle.py +0 -0
  27. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/e2e/verify_docker_deps.py +0 -0
  28. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/unit/__init__.py +0 -0
  29. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/unit/conftest.py +0 -0
  30. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/unit/test_image_utils.py +0 -0
  31. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/unit/test_runner.py +0 -0
  32. {hte_cli-0.2.21 → hte_cli-0.2.22}/tests/unit/test_scorers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hte-cli
3
- Version: 0.2.21
3
+ Version: 0.2.22
4
4
  Summary: Human Time-to-Completion Evaluation CLI
5
5
  Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
6
6
  Author: Lyptus Research
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "hte-cli"
3
- version = "0.2.21"
3
+ version = "0.2.22"
4
4
  description = "Human Time-to-Completion Evaluation CLI"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -280,6 +280,14 @@ def session_join(ctx, session_id: str, force_setup: bool):
280
280
  },
281
281
  }
282
282
 
283
+ # Send session_started event (records CLI version for debugging)
284
+ events.session_started(
285
+ {
286
+ "cli_version": __version__,
287
+ "task_id": session_info["task_id"],
288
+ }
289
+ )
290
+
283
291
  # Step 3: Run setup (skip if reconnecting without force)
284
292
  setup_start_time = time.monotonic()
285
293
  images = []
@@ -429,13 +437,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
429
437
  console.print(f"Answer: {result.answer}")
430
438
  console.print(f"Time: {result.time_seconds:.1f}s")
431
439
 
440
+ # Track upload size and timing
441
+ upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
442
+ upload_size_kb = upload_size_bytes / 1024
443
+
444
+ events.upload_started(size_bytes=upload_size_bytes)
445
+ upload_start_time = time.monotonic()
446
+
432
447
  # Upload to server
433
448
  with Progress(
434
449
  SpinnerColumn(),
435
450
  TextColumn("[progress.description]{task.description}"),
436
451
  console=console,
437
452
  ) as progress:
438
- progress.add_task("Uploading result...", total=None)
453
+ size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
454
+ progress.add_task(f"Uploading result{size_str}...", total=None)
439
455
  try:
440
456
  upload_result = api.upload_result(
441
457
  session_id=session_id,
@@ -450,6 +466,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
450
466
  console.print(f"[red]Failed to upload result: {e}[/red]")
451
467
  sys.exit(1)
452
468
 
469
+ # Record upload completion
470
+ upload_duration = time.monotonic() - upload_start_time
471
+ events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
472
+
453
473
  if upload_result.get("score") is not None:
454
474
  console.print(f"Score: {upload_result['score']}")
455
475
 
@@ -52,6 +52,10 @@ def strip_ansi(text: str) -> str:
52
52
 
53
53
  VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")
54
54
 
55
+ # Use the pipx-installed CLI (what experts use), not the local venv version
56
+ # This ensures E2E tests validate the actual published PyPI package
57
+ HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
58
+
55
59
  # Environment verification commands per benchmark
56
60
  # Based on docs/benchmark-environments.md
57
61
  # Format: (command, expected_pattern, description)
@@ -155,15 +159,15 @@ ENVIRONMENT_CHECKS = {
155
159
  ("pwd", "/root", "Working directory is /root"),
156
160
  # OS: Kali Linux (same base as cybench)
157
161
  ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
158
- # CyberGym-specific: submit.sh MUST exist for PoC submission
159
- ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present at /submit.sh"),
160
- ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
162
+ # CyberGym-specific: submit.sh in home dir for PoC submission
163
+ ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
164
+ ("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
161
165
  # Required tools - STRICT (same image as cybench)
162
166
  ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
163
167
  ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
164
168
  ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
165
- # Core security tools - STRICT
166
- ("which python3", "/usr/bin/python3", "python3 at /usr/bin/python3"),
169
+ # Python via venv (same as cybench/cybench:latest)
170
+ ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
167
171
  ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
168
172
  ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
169
173
  # apt-get for additional tools
@@ -203,7 +207,7 @@ TASK_FILE_CHECKS = {
203
207
  "primary_knowledge": [], # No local files - network-based
204
208
  },
205
209
  "cybergym": {
206
- "*": ["/submit.sh"], # All cybergym tasks need submit.sh
210
+ "*": ["/root/submit.sh"], # All cybergym tasks need submit.sh in home
207
211
  },
208
212
  "nyuctf": {
209
213
  "*": ["/home/ctfplayer/ctf_files"], # Standard location
@@ -233,19 +237,19 @@ def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
233
237
  SUBMISSION_TESTS = {
234
238
  "intercode-ctf": {
235
239
  "score_cmd": 'task score "test_flag_123"',
236
- "score_expect": "incorrect", # Should say incorrect
240
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
237
241
  "submit_cmd": 'task submit "test_flag_for_e2e"',
238
242
  "submit_expect": "submitted",
239
243
  },
240
244
  "nyuctf": {
241
245
  "score_cmd": 'task score "test_flag_123"',
242
- "score_expect": "incorrect",
246
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
243
247
  "submit_cmd": 'task submit "test_flag_for_e2e"',
244
248
  "submit_expect": "submitted",
245
249
  },
246
250
  "cybench": {
247
251
  "score_cmd": 'task score "test_flag_123"',
248
- "score_expect": "incorrect",
252
+ "score_expect": "Score:", # Output shows "Score: I" (I=Incorrect)
249
253
  "submit_cmd": 'task submit "test_flag_for_e2e"',
250
254
  "submit_expect": "submitted",
251
255
  },
@@ -266,6 +270,12 @@ SUBMISSION_TESTS = {
266
270
  "submit_cmd": 'task submit "ls -la"',
267
271
  "submit_expect": "submitted",
268
272
  },
273
+ "nl2bash": {
274
+ # Single-shot - no score command (same as cybashbench)
275
+ "score_cmd": None,
276
+ "submit_cmd": 'task submit "ls -la"',
277
+ "submit_expect": "submitted",
278
+ },
269
279
  }
270
280
 
271
281
 
@@ -374,9 +384,16 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
374
384
 
375
385
  # Start the CLI using the new session join flow
376
386
  # Session has status="created", so CLI will run full setup
377
- console.print(f"Launching hte-cli session join {session_id}...")
387
+ # Use explicit pipx path to test the published PyPI version, not local dev
388
+ if not HTE_CLI_PATH.exists():
389
+ console.print(f"[red]hte-cli not found at {HTE_CLI_PATH}[/red]")
390
+ console.print("[yellow]Install with: pipx install hte-cli[/yellow]")
391
+ results.append(TestResult("CLI installed", False, "", f"hte-cli not at {HTE_CLI_PATH}"))
392
+ return results
393
+
394
+ console.print(f"Launching {HTE_CLI_PATH} session join {session_id}...")
378
395
  child = pexpect.spawn(
379
- f"hte-cli session join {session_id}",
396
+ f"{HTE_CLI_PATH} session join {session_id}",
380
397
  encoding="utf-8",
381
398
  timeout=timeout,
382
399
  env={**os.environ, "TERM": "dumb"}, # Disable colors for easier parsing
@@ -426,6 +443,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
426
443
  results.append(TestResult("Environment setup", True, "Environment ready"))
427
444
 
428
445
  # Wait for the "Login to the system" message and docker exec command
446
+ # CVE bench builds containers from source, can take 5+ minutes
429
447
  console.print("Waiting for docker exec command...")
430
448
  idx = child.expect(
431
449
  [
@@ -433,7 +451,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
433
451
  r"docker exec -it",
434
452
  pexpect.TIMEOUT,
435
453
  ],
436
- timeout=120,
454
+ timeout=300, # 5 minutes for slow builds (cvebench)
437
455
  )
438
456
 
439
457
  if idx == 2: # TIMEOUT
@@ -603,6 +621,11 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
603
621
 
604
622
  # Test score command if available
605
623
  if sub_tests.get("score_cmd"):
624
+ # Clear buffer before score test to avoid capturing stale output
625
+ try:
626
+ docker_child.read_nonblocking(size=10000, timeout=0.5)
627
+ except Exception:
628
+ pass
606
629
  docker_child.sendline(sub_tests["score_cmd"])
607
630
  time.sleep(2)
608
631
  docker_child.expect(prompt_patterns[:-1], timeout=30)
@@ -776,9 +799,15 @@ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
776
799
  )
777
800
  )
778
801
 
779
- results.append(
780
- TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
781
- )
802
+ # Score check - some benchmarks compute scores server-side later (not immediately)
803
+ no_immediate_score = benchmark in ("cybashbench", "nl2bash")
804
+ if no_immediate_score:
805
+ # These benchmarks don't produce immediate scores - skip check
806
+ pass
807
+ else:
808
+ results.append(
809
+ TestResult("Score recorded", score != "", f"Score: {score}" if score else "No score")
810
+ )
782
811
 
783
812
  # Check events (new flow uses setup_started/setup_completed instead of session_started)
784
813
  events = ssh_query(f"""
@@ -37,6 +37,9 @@ TEST_NAME = "E2E Test User"
37
37
  # CLI config path (matches platformdirs on macOS)
38
38
  CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"
39
39
 
40
+ # Use the pipx-installed CLI (what experts use), not the local venv version
41
+ HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
42
+
40
43
  # Task assignments: 4 per benchmark
41
44
  # First 2 for pytest API tests, last 2 for interactive tests
42
45
  BENCHMARK_TASKS = {
@@ -347,10 +350,10 @@ def setup(admin_password: str, yes: bool):
347
350
  CLI_CONFIG_PATH.write_text(json.dumps(config, indent=2))
348
351
  console.print("[green]CLI config written[/green]")
349
352
 
350
- # 7. Verify CLI works
353
+ # 7. Verify CLI works (use pipx version, not local venv)
351
354
  console.print("\nVerifying CLI authentication...")
352
355
  result = subprocess.run(
353
- ["hte-cli", "auth", "status"],
356
+ [str(HTE_CLI_PATH), "auth", "status"],
354
357
  capture_output=True,
355
358
  text=True,
356
359
  )
@@ -28,6 +28,15 @@ LOCAL_EVAL_LOGS_DIR = Path.home() / "Library" / "Application Support" / "hte-cli
28
28
  VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"
29
29
 
30
30
 
31
+ def db_path_to_host_path(db_path: str) -> str:
32
+ """Translate container path stored in DB to host path on VPS.
33
+
34
+ Backend runs in Docker with /opt/hte-web/data mounted as /data,
35
+ so paths are stored as /data/... but host has /opt/hte-web/data/...
36
+ """
37
+ return db_path.replace("/data/", "/opt/hte-web/data/")
38
+
39
+
31
40
  def ssh_query(query: str) -> str:
32
41
  """Run a sqlite3 query on the VPS."""
33
42
  result = subprocess.run(
@@ -129,9 +138,14 @@ class TestVPSEvalLogs:
129
138
  """)
130
139
 
131
140
  # All completed sessions should have eval log paths
132
- assert int(with_path) == int(
133
- count
134
- ), f"Only {with_path}/{count} completed sessions have eval_log_path"
141
+ # Handle empty string from SQL query
142
+ with_path_count = int(with_path) if with_path else 0
143
+ total_count = int(count) if count else 0
144
+
145
+ if total_count == 0:
146
+ pytest.skip("No completed sessions to check")
147
+
148
+ assert with_path_count == total_count, f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"
135
149
 
136
150
  def test_eval_log_files_exist_on_vps(self):
137
151
  """Eval log files referenced in DB should exist on VPS."""
@@ -148,8 +162,9 @@ class TestVPSEvalLogs:
148
162
 
149
163
  for path in paths.split("\n"):
150
164
  if path:
151
- exists = ssh_command(f"test -f {path} && echo exists")
152
- assert exists == "exists", f"Eval log not found: {path}"
165
+ host_path = db_path_to_host_path(path)
166
+ exists = ssh_command(f"test -f {host_path} && echo exists")
167
+ assert exists == "exists", f"Eval log not found: {host_path} (DB path: {path})"
153
168
 
154
169
 
155
170
  # =============================================================================
@@ -176,32 +191,34 @@ class TestEvalLogFormat:
176
191
 
177
192
  def test_eval_log_can_be_decompressed(self):
178
193
  """Eval logs should be valid gzip files."""
179
- path = ssh_query("""
194
+ db_path = ssh_query("""
180
195
  SELECT eval_log_path FROM sessions
181
196
  WHERE status = 'submitted'
182
197
  AND eval_log_path IS NOT NULL
183
198
  LIMIT 1
184
199
  """)
185
200
 
186
- if not path:
201
+ if not db_path:
187
202
  pytest.skip("No eval logs to test")
188
203
 
204
+ path = db_path_to_host_path(db_path)
189
205
  # Try to decompress
190
206
  result = ssh_command(f"gunzip -t {path} 2>&1 && echo ok")
191
207
  assert "ok" in result, f"Eval log not valid gzip: {result}"
192
208
 
193
209
  def test_eval_log_contains_expected_structure(self):
194
210
  """Eval logs should contain expected Inspect AI structure."""
195
- path = ssh_query("""
211
+ db_path = ssh_query("""
196
212
  SELECT eval_log_path FROM sessions
197
213
  WHERE status = 'submitted'
198
214
  AND eval_log_path IS NOT NULL
199
215
  LIMIT 1
200
216
  """)
201
217
 
202
- if not path:
218
+ if not db_path:
203
219
  pytest.skip("No eval logs to test")
204
220
 
221
+ path = db_path_to_host_path(db_path)
205
222
  # List contents of the gzipped eval (it's actually a zip inside gzip)
206
223
  # First copy to temp, decompress, check structure
207
224
  result = ssh_command(f"""
@@ -226,40 +243,43 @@ class TestEvalLogUpload:
226
243
  """Test eval log upload functionality."""
227
244
 
228
245
  def test_upload_event_recorded(self):
229
- """Upload events should be recorded in session_events."""
246
+ """Upload events should be recorded in session_events for sessions with eval logs."""
247
+ # Only check sessions that have eval_log_path (proves upload succeeded)
230
248
  session_id = ssh_query(f"""
231
249
  SELECT id FROM sessions
232
250
  WHERE user_id = '{get_test_user_id()}'
233
251
  AND status = 'submitted'
252
+ AND eval_log_path IS NOT NULL
234
253
  LIMIT 1
235
254
  """)
236
255
 
237
256
  if not session_id:
238
- pytest.skip("No completed sessions")
257
+ pytest.skip("No completed sessions with eval logs")
239
258
 
240
259
  events = ssh_query(f"""
241
260
  SELECT event_type FROM session_events
242
261
  WHERE session_id = '{session_id}'
243
262
  """)
244
263
 
245
- # Should have upload-related events for completed sessions
264
+ # Should have upload-related events for sessions with eval logs
246
265
  event_list = events.split("\n") if events else []
247
266
  has_upload = any("upload" in e.lower() for e in event_list)
248
- # Completed sessions should have upload events
267
+
249
268
  assert has_upload, f"No upload events found for session {session_id}. Events: {event_list[:5]}"
250
269
 
251
270
  def test_eval_log_size_reasonable(self):
252
271
  """Eval logs should be reasonably sized (not empty, not huge)."""
253
- path = ssh_query("""
272
+ db_path = ssh_query("""
254
273
  SELECT eval_log_path FROM sessions
255
274
  WHERE status = 'submitted'
256
275
  AND eval_log_path IS NOT NULL
257
276
  LIMIT 1
258
277
  """)
259
278
 
260
- if not path:
279
+ if not db_path:
261
280
  pytest.skip("No eval logs to test")
262
281
 
282
+ path = db_path_to_host_path(db_path)
263
283
  size = ssh_command(f"stat -c%s {path} 2>/dev/null || stat -f%z {path}")
264
284
 
265
285
  if size.isdigit():
@@ -625,7 +625,7 @@ wheels = [
625
625
 
626
626
  [[package]]
627
627
  name = "hte-cli"
628
- version = "0.2.20"
628
+ version = "0.2.21"
629
629
  source = { editable = "." }
630
630
  dependencies = [
631
631
  { name = "click" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes