hte-cli 0.1.28__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hte_cli-0.1.28 → hte_cli-0.2.0}/PKG-INFO +5 -1
- {hte_cli-0.1.28 → hte_cli-0.2.0}/pyproject.toml +13 -1
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/api_client.py +25 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/cli.py +245 -3
- hte_cli-0.2.0/tests/__init__.py +1 -0
- hte_cli-0.2.0/tests/e2e/__init__.py +1 -0
- hte_cli-0.2.0/tests/e2e/automated_runner.py +550 -0
- hte_cli-0.2.0/tests/e2e/conftest.py +113 -0
- hte_cli-0.2.0/tests/e2e/e2e_test.py +682 -0
- hte_cli-0.2.0/tests/e2e/test_benchmark_flows.py +331 -0
- hte_cli-0.2.0/tests/e2e/test_eval_logs.py +299 -0
- hte_cli-0.2.0/tests/e2e/test_infrastructure.py +359 -0
- hte_cli-0.2.0/tests/e2e/test_runtime_imports.py +198 -0
- hte_cli-0.2.0/tests/e2e/test_session_lifecycle.py +264 -0
- hte_cli-0.2.0/tests/e2e/verify_docker_deps.py +177 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/uv.lock +67 -1
- {hte_cli-0.1.28 → hte_cli-0.2.0}/.gitignore +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/README.md +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/__init__.py +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/__main__.py +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/config.py +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/errors.py +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/events.py +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/image_utils.py +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/runner.py +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/scorers.py +0 -0
- {hte_cli-0.1.28 → hte_cli-0.2.0}/src/hte_cli/version_check.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hte-cli
|
|
3
|
-
Version: 0.1.28
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Human Time-to-Completion Evaluation CLI
|
|
5
5
|
Project-URL: Homepage, https://github.com/sean-peters-au/lyptus-mono
|
|
6
6
|
Author: Lyptus Research
|
|
@@ -23,6 +23,10 @@ Requires-Dist: platformdirs>=4.0
|
|
|
23
23
|
Requires-Dist: pydantic>=2.0
|
|
24
24
|
Requires-Dist: pyyaml>=6.0
|
|
25
25
|
Requires-Dist: rich>=13.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pexpect>=4.8; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: requests>=2.28; extra == 'dev'
|
|
26
30
|
Description-Content-Type: text/markdown
|
|
27
31
|
|
|
28
32
|
# hte-cli
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "hte-cli"
|
|
3
|
-
version = "0.1.28"
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "Human Time-to-Completion Evaluation CLI"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.11"
|
|
@@ -31,6 +31,13 @@ dependencies = [
|
|
|
31
31
|
"packaging>=21.0",
|
|
32
32
|
]
|
|
33
33
|
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
dev = [
|
|
36
|
+
"pexpect>=4.8",
|
|
37
|
+
"requests>=2.28",
|
|
38
|
+
"pytest>=7.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
34
41
|
[project.scripts]
|
|
35
42
|
hte-cli = "hte_cli:main"
|
|
36
43
|
|
|
@@ -44,6 +51,11 @@ build-backend = "hatchling.build"
|
|
|
44
51
|
[tool.hatch.build.targets.wheel]
|
|
45
52
|
packages = ["src/hte_cli"]
|
|
46
53
|
|
|
54
|
+
[tool.pytest.ini_options]
|
|
55
|
+
# Force unbuffered output so we see results as they run
|
|
56
|
+
addopts = "-s --tb=short"
|
|
57
|
+
testpaths = ["tests"]
|
|
58
|
+
|
|
47
59
|
[tool.ruff]
|
|
48
60
|
line-length = 100
|
|
49
61
|
target-version = "py311"
|
|
@@ -201,6 +201,31 @@ class APIClient:
|
|
|
201
201
|
},
|
|
202
202
|
)
|
|
203
203
|
|
|
204
|
+
# =========================================================================
|
|
205
|
+
# Session-based API (new flow: hte-cli session join <session_id>)
|
|
206
|
+
# =========================================================================
|
|
207
|
+
|
|
208
|
+
def join_session(self, session_id: str) -> dict:
|
|
209
|
+
"""Join an existing session created by web UI.
|
|
210
|
+
|
|
211
|
+
Returns session info including task data, benchmark, mode, etc.
|
|
212
|
+
Sets cli_connected_at on the server.
|
|
213
|
+
"""
|
|
214
|
+
return self.post(f"/sessions/{session_id}/join")
|
|
215
|
+
|
|
216
|
+
def get_session_files(self, session_id: str) -> bytes:
|
|
217
|
+
"""Download task files for a session as zip."""
|
|
218
|
+
return self.get_raw(f"/sessions/{session_id}/files")
|
|
219
|
+
|
|
220
|
+
def get_session_compose(self, session_id: str) -> str:
|
|
221
|
+
"""Get compose.yaml content for a session."""
|
|
222
|
+
content = self.get_raw(f"/sessions/{session_id}/compose")
|
|
223
|
+
return content.decode("utf-8")
|
|
224
|
+
|
|
225
|
+
# =========================================================================
|
|
226
|
+
# Result Upload
|
|
227
|
+
# =========================================================================
|
|
228
|
+
|
|
204
229
|
def upload_result(
|
|
205
230
|
self,
|
|
206
231
|
session_id: str,
|
|
@@ -147,13 +147,239 @@ def auth_status(ctx):
|
|
|
147
147
|
|
|
148
148
|
|
|
149
149
|
# =============================================================================
|
|
150
|
-
# Tasks Commands
|
|
150
|
+
# Session Commands (New flow: session join <session_id>)
|
|
151
|
+
# =============================================================================
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@cli.group()
|
|
155
|
+
def session():
|
|
156
|
+
"""Session management commands."""
|
|
157
|
+
pass
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@session.command("join")
|
|
161
|
+
@click.argument("session_id")
|
|
162
|
+
@click.option("--force-setup", is_flag=True, help="Re-run setup even if reconnecting")
|
|
163
|
+
@click.pass_context
|
|
164
|
+
def session_join(ctx, session_id: str, force_setup: bool):
|
|
165
|
+
"""Join an existing session by ID.
|
|
166
|
+
|
|
167
|
+
This is the primary way to start working on a task:
|
|
168
|
+
1. Start the task from the web UI (creates session)
|
|
169
|
+
2. Run this command with the session ID shown in the web UI
|
|
170
|
+
3. The environment will be set up and the timer will start
|
|
171
|
+
"""
|
|
172
|
+
config: Config = ctx.obj["config"]
|
|
173
|
+
|
|
174
|
+
if not config.is_authenticated():
|
|
175
|
+
console.print("[red]Not logged in. Run: hte-cli auth login[/red]")
|
|
176
|
+
sys.exit(1)
|
|
177
|
+
|
|
178
|
+
api = APIClient(config)
|
|
179
|
+
|
|
180
|
+
# Step 1: Join session
|
|
181
|
+
console.print()
|
|
182
|
+
with Progress(
|
|
183
|
+
SpinnerColumn(),
|
|
184
|
+
TextColumn("[progress.description]{task.description}"),
|
|
185
|
+
console=console,
|
|
186
|
+
) as progress:
|
|
187
|
+
progress.add_task("Joining session...", total=None)
|
|
188
|
+
|
|
189
|
+
try:
|
|
190
|
+
session_info = api.join_session(session_id)
|
|
191
|
+
except APIError as e:
|
|
192
|
+
if "Invalid session ID format" in str(e):
|
|
193
|
+
console.print(f"[red]{e}[/red]")
|
|
194
|
+
elif e.status_code == 404:
|
|
195
|
+
console.print("[red]Session not found. Check the session ID and try again.[/red]")
|
|
196
|
+
elif e.status_code == 400 and "paused" in str(e).lower():
|
|
197
|
+
console.print("[yellow]Session is paused. Please resume from the web UI first.[/yellow]")
|
|
198
|
+
else:
|
|
199
|
+
console.print(f"[red]Error: {e}[/red]")
|
|
200
|
+
sys.exit(1)
|
|
201
|
+
|
|
202
|
+
# Check if reconnecting (session already in_progress)
|
|
203
|
+
is_reconnect = session_info.get("status") == "in_progress"
|
|
204
|
+
|
|
205
|
+
if is_reconnect and not force_setup:
|
|
206
|
+
console.print("[yellow]Reconnecting to existing session...[/yellow]")
|
|
207
|
+
console.print()
|
|
208
|
+
|
|
209
|
+
console.print(
|
|
210
|
+
Panel(
|
|
211
|
+
f"[bold]Task:[/bold] {session_info['task_id']}\n"
|
|
212
|
+
f"[bold]Benchmark:[/bold] {session_info['benchmark']}\n"
|
|
213
|
+
f"[bold]Mode:[/bold] {session_info['mode']}",
|
|
214
|
+
title="Session Joined",
|
|
215
|
+
)
|
|
216
|
+
)
|
|
217
|
+
console.print()
|
|
218
|
+
|
|
219
|
+
# Import runner components
|
|
220
|
+
from hte_cli.events import EventStreamer
|
|
221
|
+
from hte_cli.runner import TaskRunner, DockerComposeManager
|
|
222
|
+
|
|
223
|
+
# Create event streamer
|
|
224
|
+
events = EventStreamer(api, session_id)
|
|
225
|
+
|
|
226
|
+
# Step 2: Download task files and compose (skip if reconnecting without force)
|
|
227
|
+
files_zip = None
|
|
228
|
+
compose_yaml = None
|
|
229
|
+
|
|
230
|
+
if not is_reconnect or force_setup:
|
|
231
|
+
with Progress(
|
|
232
|
+
SpinnerColumn(),
|
|
233
|
+
TextColumn("[progress.description]{task.description}"),
|
|
234
|
+
console=console,
|
|
235
|
+
) as progress:
|
|
236
|
+
task_id_display = progress.add_task("Downloading task files...", total=None)
|
|
237
|
+
try:
|
|
238
|
+
files_zip = api.get_session_files(session_id)
|
|
239
|
+
except APIError as e:
|
|
240
|
+
# Files are optional for some benchmarks
|
|
241
|
+
console.print(f"[dim]Note: {e}[/dim]")
|
|
242
|
+
files_zip = None
|
|
243
|
+
|
|
244
|
+
progress.update(task_id_display, description="Downloading compose file...")
|
|
245
|
+
try:
|
|
246
|
+
compose_yaml = api.get_session_compose(session_id)
|
|
247
|
+
except APIError as e:
|
|
248
|
+
# Compose is optional for simple benchmarks
|
|
249
|
+
compose_yaml = None
|
|
250
|
+
|
|
251
|
+
# Validate compose for sandbox-required benchmarks
|
|
252
|
+
benchmark = session_info.get("benchmark", "").lower()
|
|
253
|
+
SANDBOX_REQUIRED_BENCHMARKS = {"cybench", "intercode-ctf", "cvebench", "cybergym", "nyuctf"}
|
|
254
|
+
if benchmark in SANDBOX_REQUIRED_BENCHMARKS and not compose_yaml and not is_reconnect:
|
|
255
|
+
console.print(
|
|
256
|
+
f"[red]Error: {benchmark} requires a Docker sandbox but no compose file was found.[/red]"
|
|
257
|
+
)
|
|
258
|
+
console.print()
|
|
259
|
+
console.print(
|
|
260
|
+
f"Please contact support: {SUPPORT_EMAIL}"
|
|
261
|
+
)
|
|
262
|
+
sys.exit(1)
|
|
263
|
+
|
|
264
|
+
# Build assignment dict for runner compatibility
|
|
265
|
+
assignment = {
|
|
266
|
+
"assignment_id": session_info.get("assignment_id"),
|
|
267
|
+
"task_id": session_info["task_id"],
|
|
268
|
+
"benchmark": session_info["benchmark"],
|
|
269
|
+
"mode": session_info["mode"],
|
|
270
|
+
"time_cap_seconds": session_info.get("time_cap_seconds"),
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
# Build task dict for runner
|
|
274
|
+
task_data = {
|
|
275
|
+
"instructions": session_info.get("instructions", ""),
|
|
276
|
+
"metadata": session_info.get("metadata", {}),
|
|
277
|
+
"scorer_type": session_info.get("scorer_type"),
|
|
278
|
+
"intermediate_scoring": session_info.get("intermediate_scoring", False),
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
# Step 3: Run setup (skip if reconnecting without force)
|
|
282
|
+
if not is_reconnect or force_setup:
|
|
283
|
+
# Send setup_started event
|
|
284
|
+
events.setup_started({"cli_version": __version__, "task_id": assignment["task_id"]})
|
|
285
|
+
|
|
286
|
+
# Extract files and run compose
|
|
287
|
+
runner = TaskRunner(session_id, api, events, console)
|
|
288
|
+
compose_manager = None
|
|
289
|
+
|
|
290
|
+
if compose_yaml:
|
|
291
|
+
compose_manager = DockerComposeManager(
|
|
292
|
+
compose_yaml=compose_yaml,
|
|
293
|
+
files_zip=files_zip,
|
|
294
|
+
session_id=session_id,
|
|
295
|
+
task_id=assignment["task_id"],
|
|
296
|
+
console=console,
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
# Pull images
|
|
300
|
+
events.image_pull_started({})
|
|
301
|
+
compose_manager.pull_images()
|
|
302
|
+
events.image_pull_completed({})
|
|
303
|
+
|
|
304
|
+
# Start containers
|
|
305
|
+
compose_manager.up()
|
|
306
|
+
|
|
307
|
+
# Send setup_completed - THIS STARTS THE TIMER ON SERVER
|
|
308
|
+
events.setup_completed({"cli_version": __version__})
|
|
309
|
+
console.print("[green]Environment ready! Timer started.[/green]")
|
|
310
|
+
console.print()
|
|
311
|
+
else:
|
|
312
|
+
# Reconnecting - compose should already be running
|
|
313
|
+
console.print("[dim]Skipping setup (use --force-setup to re-run)[/dim]")
|
|
314
|
+
console.print()
|
|
315
|
+
|
|
316
|
+
# Step 4: Show instructions
|
|
317
|
+
if session_info.get("instructions"):
|
|
318
|
+
console.print(Panel(session_info["instructions"], title="Task Instructions"))
|
|
319
|
+
console.print()
|
|
320
|
+
|
|
321
|
+
# Step 5: Run the task interaction loop
|
|
322
|
+
runner = TaskRunner(session_id, api, events, console)
|
|
323
|
+
try:
|
|
324
|
+
result = runner.run(
|
|
325
|
+
assignment=assignment,
|
|
326
|
+
task=task_data,
|
|
327
|
+
compose_yaml=compose_yaml,
|
|
328
|
+
files_zip=files_zip,
|
|
329
|
+
)
|
|
330
|
+
except KeyboardInterrupt:
|
|
331
|
+
console.print()
|
|
332
|
+
console.print("[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]")
|
|
333
|
+
sys.exit(0)
|
|
334
|
+
|
|
335
|
+
# Step 6: Upload result
|
|
336
|
+
if result and result.answer:
|
|
337
|
+
events.session_completed(
|
|
338
|
+
elapsed_seconds=result.time_seconds,
|
|
339
|
+
answer=result.answer,
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
console.print()
|
|
343
|
+
console.print("[green]Task completed![/green]")
|
|
344
|
+
console.print(f"Answer: {result.answer}")
|
|
345
|
+
console.print(f"Time: {result.time_seconds:.1f}s")
|
|
346
|
+
|
|
347
|
+
# Upload to server
|
|
348
|
+
with Progress(
|
|
349
|
+
SpinnerColumn(),
|
|
350
|
+
TextColumn("[progress.description]{task.description}"),
|
|
351
|
+
console=console,
|
|
352
|
+
) as progress:
|
|
353
|
+
progress.add_task("Uploading result...", total=None)
|
|
354
|
+
try:
|
|
355
|
+
upload_result = api.upload_result(
|
|
356
|
+
session_id=session_id,
|
|
357
|
+
answer=result.answer or "",
|
|
358
|
+
client_active_seconds=result.time_seconds,
|
|
359
|
+
eval_log_bytes=result.eval_log_bytes,
|
|
360
|
+
score=result.score,
|
|
361
|
+
score_binarized=result.score_binarized,
|
|
362
|
+
agent_id=result.agent_id,
|
|
363
|
+
)
|
|
364
|
+
except APIError as e:
|
|
365
|
+
console.print(f"[red]Failed to upload result: {e}[/red]")
|
|
366
|
+
sys.exit(1)
|
|
367
|
+
|
|
368
|
+
if upload_result.get("score") is not None:
|
|
369
|
+
console.print(f"Score: {upload_result['score']}")
|
|
370
|
+
|
|
371
|
+
console.print()
|
|
372
|
+
console.print("[green]Done! Return to the web UI to see your results.[/green]")
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
# =============================================================================
|
|
376
|
+
# Tasks Commands (DEPRECATED - use 'session join' instead)
|
|
151
377
|
# =============================================================================
|
|
152
378
|
|
|
153
379
|
|
|
154
380
|
@cli.group()
|
|
155
381
|
def tasks():
|
|
156
|
-
"""Task commands."""
|
|
382
|
+
"""Task commands (deprecated - use 'session join' instead)."""
|
|
157
383
|
pass
|
|
158
384
|
|
|
159
385
|
|
|
@@ -223,7 +449,23 @@ def tasks_list(ctx):
|
|
|
223
449
|
@click.argument("task_id", required=False)
|
|
224
450
|
@click.pass_context
|
|
225
451
|
def tasks_run(ctx, task_id: str | None):
|
|
226
|
-
"""Run a task
|
|
452
|
+
"""[DEPRECATED] Run a task - use 'session join' instead."""
|
|
453
|
+
console.print()
|
|
454
|
+
console.print("[red]This command is deprecated.[/red]")
|
|
455
|
+
console.print()
|
|
456
|
+
console.print("The new workflow is:")
|
|
457
|
+
console.print(" 1. Start the task from the web UI: https://cyber-task-horizons.com")
|
|
458
|
+
console.print(" 2. Run the command shown: [bold]hte-cli session join <session_id>[/bold]")
|
|
459
|
+
console.print()
|
|
460
|
+
console.print("This ensures accurate timing by starting the timer only when")
|
|
461
|
+
console.print("the environment is ready, not including Docker setup time.")
|
|
462
|
+
console.print()
|
|
463
|
+
sys.exit(1)
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
# Keep the old implementation as _tasks_run_legacy for testing if needed
|
|
467
|
+
def _tasks_run_legacy(ctx, task_id: str | None):
|
|
468
|
+
"""Legacy implementation of tasks run (for testing only)."""
|
|
227
469
|
config: Config = ctx.obj["config"]
|
|
228
470
|
|
|
229
471
|
if not config.is_authenticated():
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Tests for hte-cli
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# E2E tests for cyber-task-horizons CLI
|