hte-cli 0.2.21__tar.gz → 0.2.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hte_cli-0.2.21 → hte_cli-0.2.23}/PKG-INFO +1 -1
- {hte_cli-0.2.21 → hte_cli-0.2.23}/pyproject.toml +1 -1
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/cli.py +48 -481
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/automated_runner.py +75 -28
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/e2e_test.py +19 -9
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_benchmark_flows.py +3 -4
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_eval_logs.py +61 -21
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_infrastructure.py +3 -3
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_runtime_imports.py +4 -6
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/test_session_lifecycle.py +0 -1
- {hte_cli-0.2.21 → hte_cli-0.2.23}/uv.lock +1 -1
- {hte_cli-0.2.21 → hte_cli-0.2.23}/.gitignore +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/README.md +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/__init__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/__main__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/api_client.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/config.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/errors.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/events.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/image_utils.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/runner.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/scorers.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/src/hte_cli/version_check.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/__init__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/__init__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/conftest.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/e2e/verify_docker_deps.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/__init__.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/conftest.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_image_utils.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_runner.py +0 -0
- {hte_cli-0.2.21 → hte_cli-0.2.23}/tests/unit/test_scorers.py +0 -0
src/hte_cli/cli.py

@@ -194,7 +194,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
         elif e.status_code == 404:
             console.print("[red]Session not found. Check the session ID and try again.[/red]")
         elif e.status_code == 400 and "paused" in str(e).lower():
-            console.print(
+            console.print(
+                "[yellow]Session is paused. Please resume from the web UI first.[/yellow]"
+            )
         else:
             console.print(f"[red]Error: {e}[/red]")
         sys.exit(1)
@@ -236,16 +238,16 @@ def session_join(ctx, session_id: str, force_setup: bool):
         try:
             files_zip = api.get_session_files(session_id)
             console.print(" [green]✓[/green] Task files downloaded")
-        except APIError
-            console.print(
+        except APIError:
+            console.print(" [dim]○ No task files (optional)[/dim]")
             files_zip = None

     with console.status("[dim]Fetching compose configuration...[/dim]"):
         try:
             compose_yaml = api.get_session_compose(session_id)
             console.print(" [green]✓[/green] Docker compose downloaded")
-        except APIError
-            console.print(
+        except APIError:
+            console.print(" [dim]○ No compose file (optional)[/dim]")
             compose_yaml = None

     console.print()
@@ -258,9 +260,7 @@ def session_join(ctx, session_id: str, force_setup: bool):
             f"[red]Error: {benchmark} requires a Docker sandbox but no compose file was found.[/red]"
         )
         console.print()
-        console.print(
-            f"Please contact support: {SUPPORT_EMAIL}"
-        )
+        console.print(f"Please contact support: {SUPPORT_EMAIL}")
         sys.exit(1)

     # Build assignment dict for runner compatibility
@@ -280,6 +280,14 @@ def session_join(ctx, session_id: str, force_setup: bool):
         },
     }

+    # Send session_started event (records CLI version for debugging)
+    events.session_started(
+        {
+            "cli_version": __version__,
+            "task_id": session_info["task_id"],
+        }
+    )
+
     # Step 3: Run setup (skip if reconnecting without force)
     setup_start_time = time.monotonic()
     images = []
@@ -313,7 +321,10 @@ def session_join(ctx, session_id: str, force_setup: bool):

             # Need to pull - show progress
             last_status = ["connecting..."]
-            with console.status(
+            with console.status(
+                f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
+            ) as status:
+
                 def show_progress(image: str, line: str):
                     # Show docker output directly - includes MB progress from PTY
                     # Lines look like: "abc123: Downloading 360.9MB/4.075GB"
@@ -325,7 +336,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
                     display = f"{layer_id}: {layer_status}"
                     if display != last_status[0]:
                         last_status[0] = display
-                        status.update(
+                        status.update(
+                            f"[yellow]↓[/yellow] {short_name} [dim]{display}[/dim]"
+                        )

                 success = pull_image_with_progress(img, on_progress=show_progress)

@@ -370,7 +383,13 @@ def session_join(ctx, session_id: str, force_setup: bool):
         console.print()

     # Step 3: Run the task using TaskRunner
-    step_num =
+    step_num = (
+        "3"
+        if (not is_reconnect or force_setup) and images
+        else "2"
+        if (not is_reconnect or force_setup)
+        else "1"
+    )
     console.print(f"[bold]Step {step_num}:[/bold] Starting task environment...")
     console.print("[dim]Launching Docker containers...[/dim]")
     console.print()
@@ -391,7 +410,9 @@ def session_join(ctx, session_id: str, force_setup: bool):
     except KeyboardInterrupt:
         events.docker_stopped(exit_code=130)
         console.print()
-        console.print(
+        console.print(
+            "[yellow]Interrupted. Session remains active - you can reconnect later.[/yellow]"
+        )
         sys.exit(0)
     except Exception as e:
         events.docker_stopped(exit_code=1)
@@ -415,10 +436,12 @@ def session_join(ctx, session_id: str, force_setup: bool):
         try:
             from io import BytesIO
             from zipfile import ZipFile
+
             with ZipFile(BytesIO(files_zip)) as zf:
                 if "difficulty_levels.json" in zf.namelist():
                     with zf.open("difficulty_levels.json") as f:
                         import json
+
                         difficulty_info = json.load(f)
                         agent_id = difficulty_info.get("agent_id")
         except Exception:
@@ -429,13 +452,21 @@ def session_join(ctx, session_id: str, force_setup: bool):
     console.print(f"Answer: {result.answer}")
     console.print(f"Time: {result.time_seconds:.1f}s")

+    # Track upload size and timing
+    upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
+    upload_size_kb = upload_size_bytes / 1024
+
+    events.upload_started(size_bytes=upload_size_bytes)
+    upload_start_time = time.monotonic()
+
     # Upload to server
     with Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
         console=console,
     ) as progress:
-
+        size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
+        progress.add_task(f"Uploading result{size_str}...", total=None)
         try:
             upload_result = api.upload_result(
                 session_id=session_id,
@@ -450,6 +481,10 @@ def session_join(ctx, session_id: str, force_setup: bool):
             console.print(f"[red]Failed to upload result: {e}[/red]")
             sys.exit(1)

+    # Record upload completion
+    upload_duration = time.monotonic() - upload_start_time
+    events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
+
     if upload_result.get("score") is not None:
         console.print(f"Score: {upload_result['score']}")

@@ -548,474 +583,6 @@ def tasks_run(ctx, task_id: str | None):
         sys.exit(1)


-# Keep the old implementation as _tasks_run_legacy for testing if needed
-def _tasks_run_legacy(ctx, task_id: str | None):
-    """Legacy implementation of tasks run (for testing only)."""
-    config: Config = ctx.obj["config"]
-
-    if not config.is_authenticated():
-        console.print("[red]Not logged in. Run: hte-cli auth login[/red]")
-        sys.exit(1)
-
-    # Check Docker and Compose version
-    docker_ok, docker_error = _check_docker()
-    if not docker_ok:
-        console.print(f"[red]{docker_error}[/red]")
-        sys.exit(1)
-
-    api = APIClient(config)
-
-    # Get assignments
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        progress.add_task("Fetching assignments...", total=None)
-        try:
-            assignments = api.get_assignments()
-        except APIError as e:
-            console.print(f"[red]Error: {e}[/red]")
-            sys.exit(1)
-
-    if not assignments:
-        console.print("[yellow]No pending assignments[/yellow]")
-        return
-
-    # Find the assignment to run
-    assignment = None
-    if task_id:
-        for a in assignments:
-            if a["task_id"] == task_id:
-                assignment = a
-                break
-        if not assignment:
-            console.print(f"[red]Task not found in your assignments: {task_id}[/red]")
-            sys.exit(1)
-    else:
-        # Take highest priority (first in list, already sorted by server)
-        assignment = assignments[0]
-
-    console.print()
-    console.print(
-        Panel(
-            f"[bold]Task:[/bold] {assignment['task_id']}\n"
-            f"[bold]Benchmark:[/bold] {assignment['benchmark']}\n"
-            f"[bold]Mode:[/bold] {assignment['mode']}",
-            title="Starting Task",
-        )
-    )
-    console.print()
-
-    # Import runner and events
-    from hte_cli.runner import TaskRunner
-    from hte_cli.events import EventStreamer
-
-    # Step 1: Start session (or resume existing)
-    session_id = assignment.get("session_id")
-    if not session_id:
-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            console=console,
-        ) as progress:
-            progress.add_task("Starting session...", total=None)
-            try:
-                session_info = api.start_session(assignment["assignment_id"])
-                session_id = session_info["session_id"]
-            except APIError as e:
-                console.print(f"[red]Failed to start session: {e}[/red]")
-                sys.exit(1)
-    else:
-        console.print(f"[yellow]Resuming session: {session_id}[/yellow]")
-
-    # Create event streamer
-    events = EventStreamer(api, session_id)
-
-    # Step 2: Download task files and compose
-    files_zip = None
-    compose_yaml = None
-
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        # Download files
-        task_id_display = progress.add_task("Downloading task files...", total=None)
-        try:
-            files_zip = api.get_assignment_files(assignment["assignment_id"])
-        except APIError as e:
-            # Files are optional for some benchmarks
-            console.print(f"[dim]Note: {e}[/dim]")
-
-        # Download compose
-        progress.update(task_id_display, description="Downloading compose file...")
-        try:
-            compose_yaml = api.get_assignment_compose(assignment["assignment_id"])
-        except APIError as e:
-            # Compose is optional for simple benchmarks
-            console.print(f"[dim]Note: {e}[/dim]")
-
-    # Validate compose for sandbox-required benchmarks
-    benchmark = assignment.get("benchmark", "").lower()
-    SANDBOX_REQUIRED_BENCHMARKS = {"cybench", "intercode-ctf", "cvebench", "cybergym", "nyuctf"}
-    if benchmark in SANDBOX_REQUIRED_BENCHMARKS and not compose_yaml:
-        console.print(
-            f"[red]Error: This {benchmark} task requires a Docker sandbox but none is available.[/red]"
-        )
-        if benchmark == "cvebench":
-            console.print(
-                "[yellow]This CVE may have infrastructure issues preventing Docker support.[/yellow]"
-            )
-        console.print("[dim]Contact support if you believe this is an error.[/dim]")
-        sys.exit(1)
-
-    # Step 3: Send session_started event
-    events.session_started(
-        {
-            "cli_version": __version__,
-            "task_id": assignment["task_id"],
-        }
-    )
-
-    # Step 4: Show pre-task message (benchmark-specific)
-    single_shot_benchmarks = {"nl2bash", "cybashbench"}
-
-    console.print()
-    if benchmark in single_shot_benchmarks:
-        # Single-shot benchmarks - no task score available
-        console.print(
-            Panel(
-                "[bold]Instructions[/bold]\n\n"
-                "You are about to enter the task environment.\n\n"
-                "[yellow]IMPORTANT: This is a SINGLE-SHOT task.[/yellow]\n"
-                "[yellow]You get ONE submission attempt - no retries![/yellow]\n\n"
-                "Commands available:\n"
-                " [cyan]task status[/cyan] - Show elapsed time\n"
-                ' [cyan]task submit "answer"[/cyan] - Submit your FINAL answer (ends task)\n'
-                " [cyan]task quit[/cyan] - Quit without submitting\n"
-                ' [cyan]task note "text"[/cyan] - Record observations\n',
-                title="Task Environment",
-            )
-        )
-    else:
-        # CTF/sandbox benchmarks - task score available
-        console.print(
-            Panel(
-                "[bold]Instructions[/bold]\n\n"
-                "You are about to enter the task environment.\n\n"
-                "Commands available:\n"
-                " [cyan]task status[/cyan] - Show elapsed time\n"
-                ' [cyan]task score "answer"[/cyan] - CHECK if correct (does NOT end task)\n'
-                ' [cyan]task submit "answer"[/cyan] - Submit FINAL answer (ends task)\n'
-                " [cyan]task quit[/cyan] - Quit without submitting\n"
-                ' [cyan]task note "text"[/cyan] - Record observations\n\n'
-                "[green]TIP: Use 'task score' to verify before submitting![/green]\n",
-                title="Task Environment",
-            )
-        )
-    console.print()
-
-    if not click.confirm("Ready to start?"):
-        console.print("[yellow]Cancelled[/yellow]")
-        return
-
-    # Step 5: Pre-pull Docker images with progress
-    from hte_cli.image_utils import extract_images_from_compose
-    import re
-    import time
-
-    setup_start_time = time.monotonic()
-    images: list[str] = []
-    results: list[tuple[str, bool, str]] = []
-
-    if compose_yaml:
-        images = extract_images_from_compose(compose_yaml)
-    if images:
-        events.setup_started(images)
-        console.print()
-        console.print(f"[bold]Preparing Docker environment ({len(images)} images)...[/bold]")
-
-        # Track layer progress per image: {layer_id: (status, downloaded_mb, total_mb)}
-        image_layers: dict[str, dict[str, tuple[str, float, float]]] = {}
-
-        def parse_size(size_str: str) -> float:
-            """Parse size string like '1.2MB' or '500kB' to MB."""
-            size_str = size_str.strip().upper()
-            if "GB" in size_str:
-                return float(size_str.replace("GB", "").strip()) * 1024
-            elif "MB" in size_str:
-                return float(size_str.replace("MB", "").strip())
-            elif "KB" in size_str:
-                return float(size_str.replace("KB", "").strip()) / 1024
-            elif "B" in size_str:
-                return float(size_str.replace("B", "").strip()) / (1024 * 1024)
-            return 0
-
-        def parse_docker_line(line: str) -> tuple[str | None, str, float, float]:
-            """Parse Docker pull output to extract layer ID, status, and sizes.
-
-            Returns: (layer_id, status, downloaded_mb, total_mb)
-            """
-            # Format: "79f742de2855: Downloading [==>] 1.2MB/50MB"
-            # Or: "79f742de2855: Pull complete"
-            match = re.match(r"([a-f0-9]+): (.+)", line)
-            if not match:
-                return None, "", 0, 0
-
-            layer_id = match.group(1)
-            status_part = match.group(2)
-
-            # Try to extract size info from "Downloading [==>] 1.2MB/50MB"
-            size_match = re.search(r"([\d.]+[kKmMgG]?[bB]?)/([\d.]+[kKmMgG]?[bB])", status_part)
-            if size_match:
-                downloaded = parse_size(size_match.group(1))
-                total = parse_size(size_match.group(2))
-                return layer_id, status_part, downloaded, total
-
-            return layer_id, status_part, 0, 0
-
-        def get_progress_summary(image: str) -> str:
-            """Get a human-readable progress summary for an image with MB counts."""
-            if image not in image_layers or not image_layers[image]:
-                return "connecting..."
-
-            layers = image_layers[image]
-            total_layers = len(layers)
-
-            # Count layers in different states
-            complete = 0
-            downloading = 0
-            waiting = 0
-            total_downloaded_mb = 0
-            total_size_mb = 0
-
-            for status, downloaded, total in layers.values():
-                status_lower = status.lower()
-                if "complete" in status_lower:
-                    complete += 1
-                    total_downloaded_mb += total
-                    total_size_mb += total
-                elif "downloading" in status_lower:
-                    downloading += 1
-                    total_downloaded_mb += downloaded
-                    total_size_mb += total
-                elif "waiting" in status_lower:
-                    waiting += 1
-
-            # Choose the most informative display
-            if complete == total_layers and total_layers > 0:
-                if total_size_mb > 0:
-                    return f"done ({total_size_mb:.0f}MB)"
-                return f"done ({total_layers} layers)"
-            elif total_size_mb > 0:
-                # Show MB progress when available
-                pct = int(100 * total_downloaded_mb / total_size_mb) if total_size_mb > 0 else 0
-                return f"{total_downloaded_mb:.0f}/{total_size_mb:.0f}MB ({pct}%)"
-            elif downloading > 0:
-                return f"downloading ({complete}/{total_layers} done)"
-            elif complete > 0:
-                return f"extracting ({complete}/{total_layers} done)"
-            elif waiting > 0:
-                return f"queued ({total_layers} layers)"
-            else:
-                return f"preparing ({total_layers} layers)"
-
-        def on_image_progress(image: str, line: str):
-            """Track layer-level progress with size info."""
-            if image not in image_layers:
-                image_layers[image] = {}
-
-            layer_id, status, downloaded, total = parse_docker_line(line)
-            if layer_id:
-                image_layers[image][layer_id] = (status, downloaded, total)
-
-        # Process images sequentially with clear output
-        results = []
-        for idx, img in enumerate(images, 1):
-            short_name = img.split("/")[-1] if "/" in img else img
-
-            # Check if cached first
-            from hte_cli.image_utils import check_image_exists_locally, pull_image_with_progress
-
-            if check_image_exists_locally(img):
-                console.print(f" [green]✓[/green] {short_name} [dim](cached)[/dim]")
-                results.append((img, True, "cached"))
-                continue
-
-            # Need to pull - use Rich Status for live updates
-            image_layers[img] = {}
-
-            with console.status(
-                f"[yellow]↓[/yellow] {short_name} [dim]connecting...[/dim]"
-            ) as status:
-
-                def show_progress(image: str, line: str):
-                    on_image_progress(image, line)
-                    summary = get_progress_summary(image)
-                    status.update(f"[yellow]↓[/yellow] {short_name} [dim]{summary}[/dim]")
-
-                success = pull_image_with_progress(img, on_progress=show_progress)
-
-            # Final status (printed after status context exits)
-            if success:
-                console.print(f" [green]✓[/green] {short_name} [dim](downloaded)[/dim]")
-                results.append((img, True, "pulled"))
-            else:
-                console.print(f" [red]✗[/red] {short_name} [dim](failed)[/dim]")
-                results.append((img, False, "failed"))
-
-        failed_count = sum(1 for _, ok, _ in results if not ok)
-        if failed_count > 0:
-            console.print(
-                f"[yellow]Warning: {failed_count} image(s) failed to pull. "
-                "Task may fail to start.[/yellow]"
-            )
-        console.print()
-
-    # Record image pull timing
-    if images:
-        pull_duration = time.monotonic() - setup_start_time
-        pulled = [img for img, ok, status in results if ok and status == "pulled"]
-        cached = [img for img, ok, status in results if ok and status == "cached"]
-        failed = [img for img, ok, status in results if not ok]
-        events.image_pull_completed(
-            duration_seconds=pull_duration,
-            pulled=pulled,
-            cached=cached,
-            failed=failed,
-        )
-
-    # Step 6: Run Inspect's human_cli
-    runner = TaskRunner()
-    console.print("[bold]Starting task environment...[/bold]")
-    console.print("[dim]Launching Docker containers...[/dim]")
-    console.print()
-
-    events.docker_started()
-
-    # Record total setup time (image pulls + compose up)
-    total_setup = time.monotonic() - setup_start_time
-    events.setup_completed(total_seconds=total_setup)
-
-    eval_log_bytes = None
-    local_eval_path = None
-    try:
-        result = runner.run_from_assignment(
-            assignment=assignment,
-            compose_yaml=compose_yaml,
-            files_zip=files_zip,
-        )
-        # Read eval log BEFORE cleanup (cleanup deletes the temp directory)
-        if result.eval_log_path and result.eval_log_path.exists():
-            eval_log_bytes = result.eval_log_path.read_bytes()
-
-            # Save local copy for safety
-            eval_logs_dir = get_eval_logs_dir()
-            eval_logs_dir.mkdir(parents=True, exist_ok=True)
-            local_eval_path = eval_logs_dir / result.eval_log_path.name
-            local_eval_path.write_bytes(eval_log_bytes)
-    except Exception as e:
-        events.docker_stopped(exit_code=1)
-        console.print(f"[red]Task execution failed: {e}[/red]")
-        sys.exit(1)
-    finally:
-        runner.cleanup()
-
-    events.docker_stopped(exit_code=0)
-
-    # Step 6: Show post-task summary
-    console.print()
-    console.print(
-        Panel(
-            f"[bold]Time spent:[/bold] {result.time_seconds / 60:.1f} minutes\n"
-            f"[bold]Answer:[/bold] {result.answer or '(none)'}\n"
-            f"[bold]Score:[/bold] {result.score if result.score is not None else 'pending'}",
-            title="Task Complete",
-        )
-    )
-
-    # Defensive check: don't upload if task didn't actually run
-    # (catches edge cases where runner returned without proper error)
-    if result.time_seconds == 0.0 and result.answer is None:
-        console.print()
-        console.print("[red]Task did not complete successfully (0 time, no answer).[/red]")
-        console.print("[yellow]Session preserved - run 'hte-cli tasks run' to retry.[/yellow]")
-        sys.exit(1)
-
-    # Step 7: Upload result
-    events.session_completed(
-        elapsed_seconds=result.time_seconds,
-        answer=result.answer,
-    )
-
-    # Extract agent_id from task files for CyberGym post-hoc verification
-    agent_id = None
-    if files_zip:
-        try:
-            with ZipFile(BytesIO(files_zip)) as zf:
-                if "difficulty_levels.json" in zf.namelist():
-                    with zf.open("difficulty_levels.json") as f:
-                        difficulty_info = json.load(f)
-                        agent_id = difficulty_info.get("agent_id")
-        except Exception:
-            pass  # Not a CyberGym task or malformed zip
-
-    # Show upload size info and track timing
-    upload_size_bytes = len(eval_log_bytes) if eval_log_bytes else 0
-    upload_size_kb = upload_size_bytes / 1024
-    if upload_size_kb / 1024 > 50:
-        console.print(f"[yellow]Warning: Large eval log ({upload_size_kb / 1024:.1f} MB)[/yellow]")
-
-    events.upload_started(size_bytes=upload_size_bytes)
-    upload_start_time = time.monotonic()
-
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        console=console,
-    ) as progress:
-        size_str = f" ({upload_size_kb:.0f} KB)" if upload_size_kb > 0 else ""
-        progress.add_task(f"Uploading result{size_str}...", total=None)
-
-        try:
-            upload_result = api.upload_result(
-                session_id=session_id,
-                answer=result.answer or "",
-                client_active_seconds=result.time_seconds,
-                eval_log_bytes=eval_log_bytes,
-                score=result.score,
-                score_binarized=result.score_binarized,
-                agent_id=agent_id,
-            )
-        except APIError as e:
-            console.print(f"[red]Failed to upload result: {e}[/red]")
-            if local_eval_path:
-                console.print(f"[yellow]Eval log saved locally: {local_eval_path}[/yellow]")
-            console.print("[yellow]Your result was saved locally but not uploaded.[/yellow]")
-            sys.exit(1)
-
-    # Record upload completion
-    upload_duration = time.monotonic() - upload_start_time
-    events.upload_completed(duration_seconds=upload_duration, size_bytes=upload_size_bytes)
-
-    console.print()
-    console.print("[green]Result uploaded successfully![/green]")
-
-    # Show local eval log path (quote paths with spaces for easy copy-paste)
-    if local_eval_path:
-        path_str = str(local_eval_path)
-        if " " in path_str:
-            path_str = f'"{path_str}"'
-        console.print(f"[dim]Eval log: {path_str}[/dim]")
-
-    # Show next task if available
-    if upload_result.get("next_assignment_id"):
-        console.print()
-        console.print("Run [bold]hte-cli tasks run[/bold] for the next task.")
-
-
 @tasks.command("pull-images")
 @click.option("--count", "-n", default=5, help="Number of upcoming tasks to pull images for")
 @click.pass_context
tests/e2e/automated_runner.py

@@ -52,6 +52,10 @@ def strip_ansi(text: str) -> str:

 VPS_HOST = os.environ.get("VPS_HOST", "root@209.38.25.118")

+# Use the pipx-installed CLI (what experts use), not the local venv version
+# This ensures E2E tests validate the actual published PyPI package
+HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
 # Environment verification commands per benchmark
 # Based on docs/benchmark-environments.md
 # Format: (command, expected_pattern, description)
@@ -114,7 +118,11 @@ ENVIRONMENT_CHECKS = {
         # 32-bit support (for CTF binaries)
         ("test -f /lib32/libc.so.6 && echo 'lib32'", "lib32", "32-bit libc available"),
         # Task files location - STRICT
-        (
+        (
+            "test -d /home/ctfplayer/ctf_files && echo 'EXISTS'",
+            "EXISTS",
+            "ctf_files directory exists",
+        ),
     ],
     "cybench": [
         # Working directory: /app (official cybench/cybench:latest image)
@@ -148,22 +156,26 @@ ENVIRONMENT_CHECKS = {
         # Check that files exist in /root (not just empty directory) using test -gt 0
         ("test $(ls -A /root/ | wc -l) -gt 0", None, "Items in /root"),
         # For tasks with subdirs (chunky, etc.), verify structure is mounted
-        (
+        (
+            "test $(find /root -type f 2>/dev/null | head -5 | wc -l) -gt 0",
+            None,
+            "Files under /root (recursive)",
+        ),
     ],
     "cybergym": [
         # Working directory: /root
         ("pwd", "/root", "Working directory is /root"),
         # OS: Kali Linux (same base as cybench)
         ("cat /etc/os-release | grep '^NAME='", "Kali", "Kali Linux base image"),
-        # CyberGym-specific: submit.sh
-        ("test -f /submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present
-        ("test -x /submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
+        # CyberGym-specific: submit.sh in home dir for PoC submission
+        ("test -f /root/submit.sh && echo 'submit.sh'", "submit.sh", "submit.sh present in /root"),
+        ("test -x /root/submit.sh && echo 'executable'", "executable", "submit.sh is executable"),
         # Required tools - STRICT (same image as cybench)
         ("which gdb", "/usr/bin/gdb", "gdb at /usr/bin/gdb"),
         ("which ghidra", "/usr/bin/ghidra", "ghidra at /usr/bin/ghidra"),
         ("which radare2", "/usr/bin/radare2", "radare2 at /usr/bin/radare2"),
-        #
-        ("
+        # Python via venv (same as cybench/cybench:latest)
+        ("test -f /venv/bin/python3 && echo 'venv'", "venv", "Python venv at /venv"),
         ("which nmap", "/usr/bin/nmap", "nmap at /usr/bin/nmap"),
         ("which curl", "/usr/bin/curl", "curl at /usr/bin/curl"),
         # apt-get for additional tools
@@ -203,7 +215,7 @@ TASK_FILE_CHECKS = {
         "primary_knowledge": [],  # No local files - network-based
     },
     "cybergym": {
-        "*": ["/submit.sh"],  # All cybergym tasks need submit.sh
+        "*": ["/root/submit.sh"],  # All cybergym tasks need submit.sh in home
     },
     "nyuctf": {
         "*": ["/home/ctfplayer/ctf_files"],  # Standard location
@@ -223,29 +235,28 @@ def get_file_checks_for_task(benchmark: str, task_id: str) -> list[tuple]:
     expected = task_files.get(task_id) or task_files.get("*", [])

     for path in expected:
-        checks.append(
-            (f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists")
-        )
+        checks.append((f"test -e {path} && echo 'EXISTS'", "EXISTS", f"{path} exists"))

     return checks

+
 # Commands to run for submission tests
 SUBMISSION_TESTS = {
     "intercode-ctf": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
     "nyuctf": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
     "cybench": {
         "score_cmd": 'task score "test_flag_123"',
-        "score_expect": "
+        "score_expect": "Score:",  # Output shows "Score: I" (I=Incorrect)
         "submit_cmd": 'task submit "test_flag_for_e2e"',
         "submit_expect": "submitted",
     },
@@ -266,6 +277,12 @@ SUBMISSION_TESTS = {
         "submit_cmd": 'task submit "ls -la"',
         "submit_expect": "submitted",
     },
+    "nl2bash": {
+        # Single-shot - no score command (same as cybashbench)
+        "score_cmd": None,
+        "submit_cmd": 'task submit "ls -la"',
+        "submit_expect": "submitted",
+    },
 }


@@ -374,9 +391,16 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

     # Start the CLI using the new session join flow
     # Session has status="created", so CLI will run full setup
-
+    # Use explicit pipx path to test the published PyPI version, not local dev
+    if not HTE_CLI_PATH.exists():
+        console.print(f"[red]hte-cli not found at {HTE_CLI_PATH}[/red]")
+        console.print("[yellow]Install with: pipx install hte-cli[/yellow]")
+        results.append(TestResult("CLI installed", False, "", f"hte-cli not at {HTE_CLI_PATH}"))
+        return results
+
+    console.print(f"Launching {HTE_CLI_PATH} session join {session_id}...")
     child = pexpect.spawn(
-        f"
+        f"{HTE_CLI_PATH} session join {session_id}",
         encoding="utf-8",
         timeout=timeout,
         env={**os.environ, "TERM": "dumb"},  # Disable colors for easier parsing
@@ -426,6 +450,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
     results.append(TestResult("Environment setup", True, "Environment ready"))

     # Wait for the "Login to the system" message and docker exec command
+    # CVE bench builds containers from source, can take 5+ minutes
     console.print("Waiting for docker exec command...")
     idx = child.expect(
         [
@@ -433,7 +458,7 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
             r"docker exec -it",
             pexpect.TIMEOUT,
         ],
-        timeout=
+        timeout=300,  # 5 minutes for slow builds (cvebench)
     )

     if idx == 2:  # TIMEOUT
@@ -603,15 +628,24 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list

     # Test score command if available
     if sub_tests.get("score_cmd"):
+        # Clear buffer before score test to avoid capturing stale output
+        try:
+            docker_child.read_nonblocking(size=10000, timeout=0.5)
+        except Exception:
+            pass
         docker_child.sendline(sub_tests["score_cmd"])
         time.sleep(2)
         docker_child.expect(prompt_patterns[:-1], timeout=30)
         output = strip_ansi(docker_child.before or "")
-
+
         expected_score = sub_tests.get("score_expect")
         if expected_score:
             passed = expected_score.lower() in output.lower()
-            details =
+            details = (
+                output[:200]
+                if passed
+                else f"Expected '{expected_score}' in output: {output[:100]}..."
+            )
             results.append(TestResult("task score", passed, details))
         else:
             results.append(
@@ -663,7 +697,10 @@ def run_automated_test(task_id: str, benchmark: str, timeout: int = 300) -> list
         else:
             results.append(
                 TestResult(
-                    "Submission",
+                    "Submission",
+                    False,
+                    docker_child.before or "",
+                    "Submission timed out waiting for result",
                 )
             )
     elif idx < 3:
@@ -759,26 +796,36 @@ def verify_artifacts(task_id: str, benchmark: str) -> list[TestResult]:
             "Active time recorded", float(active_seconds or 0) > 0, f"Seconds: {active_seconds}"
         )
     )
-
+
     # Verify answer
-    if
-
+    if (
+        expected_answer and benchmark != "cybergym"
+    ):  # Cybergym submits file content, hard to verify here
+        results.append(
             TestResult(
                 "Answer matches submission",
                 answer == expected_answer,
-                f"Expected: '{expected_answer}', Got: '{answer}'"
+                f"Expected: '{expected_answer}', Got: '{answer}'",
            )
        )
    else:
-
+        results.append(
            TestResult(
                "Answer recorded", bool(answer), f"Answer: {answer[:50]}..." if answer else ""
            )
        )

-
-
-
+    # Score check - some benchmarks compute scores server-side later (not immediately)
+    no_immediate_score = benchmark in ("cybashbench", "nl2bash")
+    if no_immediate_score:
+        # These benchmarks don't produce immediate scores - skip check
+        pass
+    else:
+        results.append(
+            TestResult(
+                "Score recorded", score != "", f"Score: {score}" if score else "No score"
+            )
+        )

     # Check events (new flow uses setup_started/setup_completed instead of session_started)
     events = ssh_query(f"""
tests/e2e/e2e_test.py

@@ -37,6 +37,9 @@ TEST_NAME = "E2E Test User"
 # CLI config path (matches platformdirs on macOS)
 CLI_CONFIG_PATH = Path.home() / "Library" / "Application Support" / "hte-cli" / "config.json"

+# Use the pipx-installed CLI (what experts use), not the local venv version
+HTE_CLI_PATH = Path.home() / ".local" / "bin" / "hte-cli"
+
 # Task assignments: 4 per benchmark
 # First 2 for pytest API tests, last 2 for interactive tests
 BENCHMARK_TASKS = {
@@ -347,10 +350,10 @@ def setup(admin_password: str, yes: bool):
     CLI_CONFIG_PATH.write_text(json.dumps(config, indent=2))
     console.print("[green]CLI config written[/green]")

-    # 7. Verify CLI works
+    # 7. Verify CLI works (use pipx version, not local venv)
     console.print("\nVerifying CLI authentication...")
     result = subprocess.run(
-        [
+        [str(HTE_CLI_PATH), "auth", "status"],
         capture_output=True,
         text=True,
     )
@@ -734,11 +737,14 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):

     phase1_result = subprocess.run(
         [
-            "uv",
+            "uv",
+            "run",
+            "pytest",
             str(tests_dir / "test_infrastructure.py"),
             str(tests_dir / "test_runtime_imports.py"),
             str(tests_dir / "test_benchmark_flows.py"),
-            "-v",
+            "-v",
+            "--tb=short",
         ],
         cwd=tests_dir.parent.parent,
     )
@@ -785,10 +791,13 @@ def full(admin_password: str, yes: bool, skip_setup: bool, cleanup_after: bool):

     phase3_result = subprocess.run(
         [
-            "uv",
+            "uv",
+            "run",
+            "pytest",
             str(tests_dir / "test_session_lifecycle.py"),
             str(tests_dir / "test_eval_logs.py"),
-            "-v",
+            "-v",
+            "--tb=short",
         ],
         cwd=tests_dir.parent.parent,
     )
@@ -833,10 +842,11 @@ def _print_full_summary(results: dict):
     if results["phase2"]:
         passed = sum(1 for v in results["phase2"].values() if v)
         total = len(results["phase2"])
-        status =
+        status = (
+            "[green]PASSED[/green]" if passed == total else f"[yellow]{passed}/{total}[/yellow]"
+        )
         details = ", ".join(
-            f"[green]{b}[/green]" if v else f"[red]{b}[/red]"
-            for b, v in results["phase2"].items()
+            f"[green]{b}[/green]" if v else f"[red]{b}[/red]" for b, v in results["phase2"].items()
         )
         table.add_row("Phase 2: Benchmarks", status, details)

tests/e2e/test_benchmark_flows.py

@@ -16,7 +16,6 @@ import requests
 from tests.e2e.conftest import (
     BASE_URL,
     EXPECTED_ASSIGNMENT_COUNT,
-    EXPECTED_TASKS,
     get_test_user_id,
     ssh_command,
     ssh_query,
@@ -379,9 +378,9 @@ class TestCrossBenchmark:
             SELECT COUNT(*) FROM assignments
             WHERE user_id = '{get_test_user_id()}'
         """)
-        assert
-
-        )
+        assert (
+            int(count) == EXPECTED_ASSIGNMENT_COUNT
+        ), f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"


 # =============================================================================
tests/e2e/test_eval_logs.py

@@ -28,6 +28,18 @@ LOCAL_EVAL_LOGS_DIR = Path.home() / "Library" / "Application Support" / "hte-cli"
 VPS_EVAL_LOGS_DIR = "/opt/hte-web/data/eval_logs"


+def db_path_to_host_path(db_path: str) -> str:
+    """Translate container path stored in DB to host path on VPS.
+
+    Backend may store paths as:
+    - /data/... (container-relative, needs translation)
+    - /opt/hte-web/data/... (already host path, return as-is)
+    """
+    if db_path.startswith("/opt/hte-web/"):
+        return db_path  # Already a host path
+    return db_path.replace("/data/", "/opt/hte-web/data/")
+
+
 def ssh_query(query: str) -> str:
     """Run a sqlite3 query on the VPS."""
     result = subprocess.run(
@@ -129,9 +141,16 @@ class TestVPSEvalLogs:
         """)

         # All completed sessions should have eval log paths
-
-
-
+        # Handle empty string from SQL query
+        with_path_count = int(with_path) if with_path else 0
+        total_count = int(count) if count else 0
+
+        if total_count == 0:
+            pytest.skip("No completed sessions to check")
+
+        assert (
+            with_path_count == total_count
+        ), f"Only {with_path_count}/{total_count} completed sessions have eval_log_path"

     def test_eval_log_files_exist_on_vps(self):
         """Eval log files referenced in DB should exist on VPS."""
@@ -148,8 +167,9 @@ class TestVPSEvalLogs:

         for path in paths.split("\n"):
             if path:
-
-
+                host_path = db_path_to_host_path(path)
+                exists = ssh_command(f"test -f {host_path} && echo exists")
+                assert exists == "exists", f"Eval log not found: {host_path} (DB path: {path})"


 # =============================================================================
@@ -176,39 +196,41 @@ class TestEvalLogFormat:

     def test_eval_log_can_be_decompressed(self):
         """Eval logs should be valid gzip files."""
-
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         # Try to decompress
         result = ssh_command(f"gunzip -t {path} 2>&1 && echo ok")
         assert "ok" in result, f"Eval log not valid gzip: {result}"

     def test_eval_log_contains_expected_structure(self):
         """Eval logs should contain expected Inspect AI structure."""
-
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         # List contents of the gzipped eval (it's actually a zip inside gzip)
-        #
+        # Use python's zipfile since unzip may not be installed
         result = ssh_command(f"""
             cd /tmp &&
             cp {path} test_eval.gz &&
             gunzip -f test_eval.gz &&
-
+            python3 -c "import zipfile; z=zipfile.ZipFile('test_eval'); print('\\n'.join(z.namelist()[:20]))"
         """)

         # Should contain header.json at minimum
@@ -226,40 +248,58 @@ class TestEvalLogUpload:
     """Test eval log upload functionality."""

     def test_upload_event_recorded(self):
-        """Upload events should be recorded in session_events.
+        """Upload events should be recorded in session_events for sessions with eval logs.
+
+        Note: Upload events were added in CLI v0.2.22. Sessions created with older
+        CLI versions won't have these events.
+        """
+        # Find a session that has:
+        # 1. eval_log_path (proves upload succeeded)
+        # 2. session_started event with cli_version >= 0.2.22 (has upload events)
         session_id = ssh_query(f"""
-            SELECT id FROM sessions
-
-
+            SELECT s.id FROM sessions s
+            JOIN session_events se ON s.id = se.session_id
+            WHERE s.user_id = '{get_test_user_id()}'
+            AND s.status = 'submitted'
+            AND s.eval_log_path IS NOT NULL
+            AND se.event_type = 'session_started'
+            AND (
+                json_extract(se.event_data, '$.cli_version') >= '0.2.22'
+                OR json_extract(se.event_data, '$.cli_version') LIKE '0.3.%'
+                OR json_extract(se.event_data, '$.cli_version') LIKE '1.%'
+            )
             LIMIT 1
         """)

         if not session_id:
-            pytest.skip("No
+            pytest.skip("No sessions with CLI v0.2.22+ (upload events added in v0.2.22)")

         events = ssh_query(f"""
             SELECT event_type FROM session_events
             WHERE session_id = '{session_id}'
         """)

-        # Should have upload-related events for
+        # Should have upload-related events for sessions with eval logs
         event_list = events.split("\n") if events else []
         has_upload = any("upload" in e.lower() for e in event_list)
-
-        assert
+
+        assert (
+            has_upload
+        ), f"No upload events found for session {session_id}. Events: {event_list[:5]}"

     def test_eval_log_size_reasonable(self):
         """Eval logs should be reasonably sized (not empty, not huge)."""
-
+        db_path = ssh_query("""
             SELECT eval_log_path FROM sessions
             WHERE status = 'submitted'
             AND eval_log_path IS NOT NULL
             LIMIT 1
         """)

-        if not
+        if not db_path:
             pytest.skip("No eval logs to test")

+        path = db_path_to_host_path(db_path)
         size = ssh_command(f"stat -c%s {path} 2>/dev/null || stat -f%z {path}")

         if size.isdigit():
tests/e2e/test_infrastructure.py

@@ -114,9 +114,9 @@ class TestAssignments:
         count = ssh_query(
             f"SELECT COUNT(*) FROM assignments WHERE user_id = '{get_test_user_id()}'"
         )
-        assert
-
-        )
+        assert (
+            int(count) == EXPECTED_ASSIGNMENT_COUNT
+        ), f"Expected {EXPECTED_ASSIGNMENT_COUNT} assignments, got {count}"

     @pytest.mark.parametrize("benchmark,tasks", EXPECTED_TASKS.items())
     def test_benchmark_tasks_assigned(self, benchmark, tasks):
tests/e2e/test_runtime_imports.py

@@ -150,9 +150,7 @@ print(f'Loaded {len(HUMAN_REGISTRY)} benchmarks: {list(HUMAN_REGISTRY.keys())}')

         assert "Loaded" in result.stdout
         # Should have exactly 7 benchmarks
-        assert "7 benchmarks" in result.stdout,
-            f"Expected 7 benchmarks, got: {result.stdout}"
-        )
+        assert "7 benchmarks" in result.stdout, f"Expected 7 benchmarks, got: {result.stdout}"

     def test_backend_can_import_adapters(self):
         """Backend should be able to instantiate adapters."""
@@ -180,9 +178,9 @@

         # All benchmarks should show OK - STRICT check
         for benchmark in BENCHMARKS:
-            assert
-            f"
-            )
+            assert (
+                f"{benchmark}: OK" in result.stdout
+            ), f"Benchmark {benchmark} not found or not OK in output: {result.stdout}"


 class TestLocalImports:
tests/e2e/test_session_lifecycle.py

@@ -223,7 +223,6 @@ class TestSessionState:
             WHERE user_id = '{get_test_user_id()}'
             AND status = 'abandoned'
         """)
-        count = int(abandoned_count) if abandoned_count else 0
         # Verify the query returned a valid number (not empty/error)
         assert abandoned_count.strip().isdigit(), f"Query returned invalid value: {abandoned_count}"
         # Note: count can legitimately be 0 if no sessions were abandoned
All remaining files listed above are unchanged between 0.2.21 and 0.2.23.