PyPI - wafer-cli - Versions diffs - 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl - Mend

wafer-cli 0.2.9 (py3-none-any.whl) → 0.2.10 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

wafer/GUIDE.md +18 -7
wafer/api_client.py +4 -0
wafer/cli.py +1177 -278
wafer/corpus.py +158 -32
wafer/evaluate.py +75 -6
wafer/kernel_scope.py +132 -31
wafer/nsys_analyze.py +903 -73
wafer/nsys_profile.py +511 -0
wafer/output.py +241 -0
wafer/skills/wafer-guide/SKILL.md +13 -0
wafer/ssh_keys.py +261 -0
wafer/targets_ops.py +718 -0
wafer/wevin_cli.py +127 -18
wafer/workspaces.py +232 -184
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/METADATA +1 -1
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/RECORD +19 -15
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/WHEEL +0 -0
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/entry_points.txt +0 -0
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.10.dist-info}/top_level.txt +0 -0

wafer/cli.py CHANGED Viewed

@@ -182,7 +182,12 @@ workspaces_app = typer.Typer(
 Workspaces are on-demand cloud GPU environments. Requires authentication (wafer login).
-  wafer workspaces create dev --gpu H100   # Create workspace
+Available GPUs:
+  MI300X  AMD Instinct MI300X (192GB HBM3, ROCm)
+  B200    NVIDIA Blackwell B200 (180GB HBM3e, CUDA)
+Commands:
+  wafer workspaces create dev --gpu B200   # Create workspace
   wafer workspaces exec dev -- python x.py # Run commands
   wafer workspaces ssh dev                 # Interactive SSH
   wafer workspaces sync dev ./project      # Sync files
@@ -190,6 +195,36 @@ Workspaces are on-demand cloud GPU environments. Requires authentication (wafer
 )
 app.add_typer(workspaces_app, name="workspaces")
+# SSH Key management (BYOK - Bring Your Own Key)
+ssh_keys_app = typer.Typer(
+    help="""Manage SSH public keys for workspace access.
+Register your SSH public keys here. These keys are installed in all workspaces
+you provision, enabling SSH access from any machine with your private key.
+  wafer ssh-keys list              # List registered keys
+  wafer ssh-keys add               # Add key (auto-detects ~/.ssh/id_ed25519.pub)
+  wafer ssh-keys add ~/.ssh/id_rsa.pub --name laptop  # Add specific key
+  wafer ssh-keys remove <key-id>   # Remove a key"""
+)
+app.add_typer(ssh_keys_app, name="ssh-keys")
+# Target operations (exec/ssh/sync on configured targets)
+targets_ops_app = typer.Typer(
+    help="""Execute commands on configured GPU targets.
+Run commands, SSH, or sync files to targets without going through evaluate.
+Useful for exploratory work, debugging, or custom scripts.
+  wafer targets exec my-target -- python test.py    # Run command
+  wafer targets ssh my-target                       # Interactive SSH
+  wafer targets sync my-target ./local_dir          # Sync files
+Supports: RunPod, DigitalOcean (auto-provisions), SSH targets (baremetal/vm).
+Configure targets with: wafer config targets init ..."""
+)
+app.add_typer(targets_ops_app, name="targets")
 # Billing management
 billing_app = typer.Typer(help="Manage billing, credits, and subscription")
 app.add_typer(billing_app, name="billing")
@@ -257,13 +292,100 @@ nvidia_app.add_typer(tracelens_app, name="tracelens")
 amd_app = typer.Typer(help="AMD GPU profiling and analysis tools")
 app.add_typer(amd_app, name="amd")
-# ISA analysis - under amd
-isa_app = typer.Typer(help="ISA analysis for AMD GPU code objects (.co files)")
+# Unified ISA Analyzer - supports both .co files and Triton artifacts
+isa_app = typer.Typer(help="ISA analysis for AMD GPU kernels (.co, .s, .ll, .ttgir files)")
 amd_app.add_typer(isa_app, name="isa")
-# Kernel Scope - static ISA analysis for Triton kernels
-kernel_scope_app = typer.Typer(help="Static ISA analysis for Triton compilation artifacts")
-amd_app.add_typer(kernel_scope_app, name="kernel-scope")
+# =============================================================================
+# Roofline analysis (wafer roofline)
+# =============================================================================
+@app.command("roofline")
+def roofline_cmd(
+    gpu: str | None = typer.Option(
+        None, "--gpu", "-g", help="GPU name (e.g., H100, B200, MI300X, A100)"
+    ),
+    bytes_moved: float | None = typer.Option(
+        None, "--bytes", "-b", help="Theoretical minimum bytes moved"
+    ),
+    flops: float | None = typer.Option(None, "--flops", "-f", help="Theoretical minimum FLOPs"),
+    time_ms: float | None = typer.Option(
+        None, "--time-ms", "-t", help="Actual kernel time in milliseconds"
+    ),
+    dtype: str = typer.Option(
+        "fp16", "--dtype", "-d", help="Data type for compute ceiling (fp16, fp32, bf16, fp8, int8)"
+    ),
+    list_gpus: bool = typer.Option(False, "--list-gpus", help="List available GPU specs and exit"),
+) -> None:
+    """Analyze kernel performance against roofline model.
+    The roofline model shows the theoretical speed-of-light (SOL) for your kernel
+    based on whether it's memory-bound or compute-bound.
+    You need to provide:
+    - The GPU you ran on
+    - Theoretical minimum bytes moved (not actual - what the algorithm requires)
+    - Theoretical minimum FLOPs
+    - Actual measured kernel time
+    Example:
+        # Analyze a matmul kernel (4096x4096x4096, FP16)
+        # Theoretical: 2*M*N*K FLOPs = 137.4 TFLOP
+        # Theoretical bytes: (M*K + K*N + M*N) * 2 = 100.7 MB
+        wafer roofline --gpu H100 --bytes 100.7e6 --flops 137.4e12 --time-ms 85
+        # Analyze a memory-bound elementwise add (1B elements FP32)
+        # Reads 2 tensors, writes 1 = 12 GB total
+        # 1B adds = 1 GFLOP
+        wafer roofline --gpu H100 --bytes 12e9 --flops 1e9 --time-ms 4 --dtype fp32
+        # List available GPUs
+        wafer roofline --list-gpus
+    """
+    from wafer_core.roofline import get_gpu_spec, roofline_analysis
+    from wafer_core.roofline import list_gpus as get_all_gpus
+    if list_gpus:
+        typer.echo("Available GPUs:")
+        for name in get_all_gpus():
+            spec = get_gpu_spec(name)
+            typer.echo(
+                f"  {name}: {spec.peak_bandwidth_gbps:.0f} GB/s, {spec.peak_tflops_fp16:.0f} TFLOPS FP16"
+            )
+        return
+    # Validate required args for analysis
+    missing = []
+    if gpu is None:
+        missing.append("--gpu")
+    if bytes_moved is None:
+        missing.append("--bytes")
+    if flops is None:
+        missing.append("--flops")
+    if time_ms is None:
+        missing.append("--time-ms")
+    if missing:
+        typer.echo(f"Error: Missing required options: {', '.join(missing)}", err=True)
+        typer.echo("", err=True)
+        typer.echo("Run 'wafer roofline --help' for usage.", err=True)
+        raise typer.Exit(1)
+    try:
+        result = roofline_analysis(
+            gpu=gpu,
+            dtype=dtype,
+            bytes_moved=bytes_moved,
+            flops=flops,
+            time_ms=time_ms,
+        )
+    except ValueError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    typer.echo(result.format_report())
 # =============================================================================
 # Skill management (wafer skill ...)
@@ -279,21 +401,22 @@ def skill_install(
         "all",
         "--target",
         "-t",
-        help="Target tool: claude, codex, or all",
+        help="Target tool: claude, codex, cursor, or all",
     ),
     force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing skill"),
 ) -> None:
     """Install the wafer-guide skill for AI coding assistants.
     Installs the bundled skill to make wafer commands discoverable by
-    Claude Code and/or OpenAI Codex CLI.
+    Claude Code, OpenAI Codex CLI, and/or Cursor.
     Skills follow the open agent skills specification (agentskills.io).
     Examples:
-        wafer skill install              # Install for both Claude and Codex
+        wafer skill install              # Install for all tools
         wafer skill install -t claude    # Install for Claude Code only
         wafer skill install -t codex     # Install for Codex CLI only
+        wafer skill install -t cursor    # Install for Cursor only
         wafer skill install --force      # Overwrite existing installation
     """
     # Locate bundled skill
@@ -311,9 +434,13 @@ def skill_install(
         ))
     if target in ("all", "codex"):
         targets_to_install.append(("Codex CLI", Path.home() / ".codex" / "skills" / "wafer-guide"))
+    if target in ("all", "cursor"):
+        targets_to_install.append(("Cursor", Path.home() / ".cursor" / "skills" / "wafer-guide"))
     if not targets_to_install:
-        typer.echo(f"Error: Unknown target '{target}'. Use: claude, codex, or all", err=True)
+        typer.echo(
+            f"Error: Unknown target '{target}'. Use: claude, codex, cursor, or all", err=True
+        )
         raise typer.Exit(1)
     for tool_name, dest_path in targets_to_install:
@@ -348,14 +475,15 @@ def skill_uninstall(
         "all",
         "--target",
         "-t",
-        help="Target tool: claude, codex, or all",
+        help="Target tool: claude, codex, cursor, or all",
     ),
 ) -> None:
     """Uninstall the wafer-guide skill.
     Examples:
-        wafer skill uninstall              # Uninstall from both
+        wafer skill uninstall              # Uninstall from all tools
         wafer skill uninstall -t claude    # Uninstall from Claude Code only
+        wafer skill uninstall -t cursor    # Uninstall from Cursor only
     """
     targets_to_uninstall: list[tuple[str, Path]] = []
@@ -369,9 +497,16 @@ def skill_uninstall(
             "Codex CLI",
             Path.home() / ".codex" / "skills" / "wafer-guide",
         ))
+    if target in ("all", "cursor"):
+        targets_to_uninstall.append((
+            "Cursor",
+            Path.home() / ".cursor" / "skills" / "wafer-guide",
+        ))
     if not targets_to_uninstall:
-        typer.echo(f"Error: Unknown target '{target}'. Use: claude, codex, or all", err=True)
+        typer.echo(
+            f"Error: Unknown target '{target}'. Use: claude, codex, cursor, or all", err=True
+        )
         raise typer.Exit(1)
     for tool_name, dest_path in targets_to_uninstall:
@@ -406,6 +541,7 @@ def skill_status() -> None:
     installations = [
         ("Claude Code", Path.home() / ".claude" / "skills" / "wafer-guide"),
         ("Codex CLI", Path.home() / ".codex" / "skills" / "wafer-guide"),
+        ("Cursor", Path.home() / ".cursor" / "skills" / "wafer-guide"),
     ]
     for tool_name, path in installations:
@@ -1114,6 +1250,11 @@ def agent(  # noqa: PLR0913
         "--list-sessions",
         help="List recent sessions and exit",
     ),
+    get_session: str | None = typer.Option(
+        None,
+        "--get-session",
+        help="Get session by ID and print messages (use with --json)",
+    ),
     tools: str | None = typer.Option(
         None,
         "--tools",
@@ -1160,47 +1301,7 @@ def agent(  # noqa: PLR0913
         None,
         "--corpus",
         "-c",
-        help="Documentation corpus to use (cuda, cutlass, hip). Must be downloaded first.",
-    ),
-    # Legacy kernel optimization options (hidden, for backwards compat)
-    problem: Path | None = typer.Option(
-        None,
-        "--problem",
-        hidden=True,
-        help="[Legacy] Path to problem YAML config file",
-    ),
-    reference: Path | None = typer.Option(
-        None,
-        "--reference",
-        "--ref",
-        hidden=True,
-        help="[Legacy] Path to reference kernel file",
-    ),
-    description: str | None = typer.Option(
-        None,
-        "--description",
-        "--desc",
-        hidden=True,
-        help="[Legacy] Problem description",
-    ),
-    test: list[str] | None = typer.Option(
-        None,
-        "--test",
-        hidden=True,
-        help="[Legacy] Test case",
-    ),
-    benchmark: list[str] | None = typer.Option(
-        None,
-        "--benchmark",
-        "-b",
-        hidden=True,
-        help="[Legacy] Benchmark case",
-    ),
-    speedup_target: float | None = typer.Option(
-        None,
-        "--speedup",
-        hidden=True,
-        help="[Legacy] Speedup target",
+        help="Documentation corpus to use (cuda, cutlass, hip, amd). Must be downloaded first.",
     ),
 ) -> None:
     """AI assistant for GPU kernel development.
@@ -1287,20 +1388,15 @@ def agent(  # noqa: PLR0913
         prompt=actual_prompt,
         interactive=use_tui,
         single_turn=single_turn,
-        problem=str(problem) if problem else None,
-        reference=str(reference) if reference else None,
-        description=description,
-        tests=list(test) if test else None,
-        benchmarks=list(benchmark) if benchmark else None,
         model=model,
-        max_turns=max_turns,
-        speedup_target=speedup_target,
         resume=resume,
         from_turn=from_turn,
         list_sessions=list_sessions,
+        get_session=get_session,
         tools=tools.split(",") if tools else None,
         allow_spawn=allow_spawn,
         max_tool_fails=max_tool_fails,
+        max_turns=max_turns,
         json_output=json_output,
         template=template,
         template_args=parsed_template_args,
@@ -1310,7 +1406,7 @@ def agent(  # noqa: PLR0913
 # =============================================================================
 # Evaluate command
-# Hidden aliases for backwards compatibility
+# Hidden aliases for agent command
 def _make_agent_alias(name: str, doc: str) -> None:
     """Create a hidden alias that delegates to agent()."""
@@ -1325,6 +1421,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
         resume: str | None = typer.Option(None, "--resume", "-r"),
         from_turn: int | None = typer.Option(None, "--from-turn"),
         list_sessions: bool = typer.Option(False, "--list-sessions"),
+        get_session: str | None = typer.Option(None, "--get-session"),
         tools: str | None = typer.Option(None, "--tools"),
         allow_spawn: bool = typer.Option(False, "--allow-spawn"),
         max_tool_fails: int | None = typer.Option(None, "--max-tool-fails"),
@@ -1334,12 +1431,6 @@ def _make_agent_alias(name: str, doc: str) -> None:
         template: str | None = typer.Option(None, "--template", "-t"),
         template_args: list[str] | None = typer.Option(None, "--args"),
         corpus: str | None = typer.Option(None, "--corpus"),
-        problem: Path | None = typer.Option(None, "--problem", hidden=True),
-        reference: Path | None = typer.Option(None, "--reference", hidden=True),
-        description: str | None = typer.Option(None, "--description", hidden=True),
-        test: list[Path] | None = typer.Option(None, "--test", hidden=True),
-        benchmark: list[Path] | None = typer.Option(None, "--benchmark", hidden=True),
-        speedup_target: float | None = typer.Option(None, "--speedup-target", hidden=True),
     ) -> None:
         agent(
             prompt=prompt,
@@ -1349,6 +1440,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
             resume=resume,
             from_turn=from_turn,
             list_sessions=list_sessions,
+            get_session=get_session,
             tools=tools,
             allow_spawn=allow_spawn,
             max_tool_fails=max_tool_fails,
@@ -1358,12 +1450,6 @@ def _make_agent_alias(name: str, doc: str) -> None:
             template=template,
             template_args=template_args,
             corpus=corpus,
-            problem=problem,
-            reference=reference,
-            description=description,
-            test=test,
-            benchmark=benchmark,
-            speedup_target=speedup_target,
         )
     alias_cmd.__doc__ = doc
@@ -1649,7 +1735,7 @@ def kernelbench_list_problems() -> None:
 @kernelbench_app.callback(invoke_without_command=True)
-def kernelbench_evaluate(  # noqa: PLR0913
+def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     ctx: typer.Context,
     implementation: Path | None = typer.Option(
         None,
@@ -1685,10 +1771,22 @@ def kernelbench_evaluate(  # noqa: PLR0913
     defensive: bool = typer.Option(
         False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
     ),
+    backend: str | None = typer.Option(
+        None,
+        "--backend",
+        help="Kernel backend for static validation (hip, cuda, triton, cute, tilelang, thunderkittens). "
+        "When specified, validates that the implementation uses the correct backend primitives.",
+    ),
     sync_artifacts: bool = typer.Option(
         True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
     ),
     gpu_id: int | None = typer.Option(None, "--gpu-id", help="Override GPU ID"),
+    json_output: bool = typer.Option(
+        False, "--json", help="Output as single JSON object (machine-readable)"
+    ),
+    jsonl_output: bool = typer.Option(
+        False, "--jsonl", help="Output as streaming JSON Lines (one object per event)"
+    ),
 ) -> None:
     """Run kernel evaluation in KernelBench format (ModelNew class).
@@ -1744,6 +1842,10 @@ def kernelbench_evaluate(  # noqa: PLR0913
         raise typer.Exit(1)
     from .evaluate import KernelBenchEvaluateArgs, run_evaluate_kernelbench
+    from .output import OutputCollector, format_evaluate_result, get_output_format
+    output_format = get_output_format(json_output, jsonl_output)
+    collector = OutputCollector(format=output_format)
     # If pool specified, acquire a target from the pool
     resolved_target = target or ""
@@ -1756,32 +1858,36 @@ def kernelbench_evaluate(  # noqa: PLR0913
         try:
             pool_targets = get_pool(pool)
         except FileNotFoundError as e:
-            typer.echo(f"Error: {e}", err=True)
+            collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
+            collector.finalize()
             raise typer.Exit(1) from None
         # Filter to only targets with valid auth
         usable_targets, skipped = filter_pool_by_auth(pool_targets)
         if skipped:
-            typer.echo(f"Skipping targets without auth: {', '.join(skipped)}", err=True)
+            collector.emit("pool_auth_skip", targets=skipped)
         if not usable_targets:
-            typer.echo(f"Error: No usable targets in pool '{pool}'", err=True)
-            typer.echo("  All targets require authentication that is not configured.", err=True)
-            typer.echo("  Run 'wafer auth status' to see which providers need setup.", err=True)
+            collector.set_error("pool", "NoUsableTargets", pool=pool)
+            collector.finalize()
             raise typer.Exit(1) from None
-        typer.echo(f"Acquiring target from pool '{pool}' ({len(usable_targets)} targets)...")
+        collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
         pool_lock_context = acquire_from_pool(usable_targets)
         acquired_target = pool_lock_context.__enter__()
         if acquired_target is None:
-            typer.echo(f"Error: All targets in pool '{pool}' are busy", err=True)
-            typer.echo(f"  Targets: {', '.join(usable_targets)}", err=True)
+            # Exit context manager before raising to avoid resource leak
+            pool_lock_context.__exit__(None, None, None)
+            collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
+            collector.finalize()
             raise typer.Exit(1)
-        typer.echo(f"Acquired target: {acquired_target}")
+        collector.emit("pool_acquired", target=acquired_target)
         resolved_target = acquired_target
+    collector.target = resolved_target
     args = KernelBenchEvaluateArgs(
         implementation=implementation,
         reference=reference,
@@ -1791,41 +1897,45 @@ def kernelbench_evaluate(  # noqa: PLR0913
         inputs=inputs,
         seed=seed,
         defensive=defensive,
+        backend=backend,
         sync_artifacts=sync_artifacts,
         gpu_id=gpu_id,
     )
+    collector.emit("started", target=resolved_target)
     try:
         import trio_asyncio
+        collector.emit("evaluation", status="running")
         result = trio_asyncio.run(run_evaluate_kernelbench, args)
     except KeyboardInterrupt:
-        typer.echo("\nInterrupted by user", err=True)
+        collector.set_error("evaluation", "Interrupted", message="Interrupted by user")
+        collector.finalize()
         raise typer.Exit(130) from None
     except Exception as e:
-        typer.echo(f"Error: {e}", err=True)
+        collector.set_error("evaluation", "Exception", message=str(e))
+        collector.finalize()
         raise typer.Exit(1) from None
     finally:
         # Release pool lock if we acquired one
         if pool_lock_context is not None:
             pool_lock_context.__exit__(None, None, None)
-    # Print results
+    # Build structured output
+    eval_output = format_evaluate_result(result, target=resolved_target)
+    collector._result = eval_output
+    # Print results based on output format
     if result.success:
-        typer.echo("")
-        typer.echo("=" * 60)
-        status = "PASS" if result.all_correct else "FAIL"
-        typer.echo(f"Result: {status}")
-        score_pct = f"{result.correctness_score:.1%}"
-        typer.echo(f"Correctness: {result.passed_tests}/{result.total_tests} ({score_pct})")
-        if result.geomean_speedup > 0:
-            typer.echo(f"Speedup: {result.geomean_speedup:.2f}x")
-        typer.echo("=" * 60)
+        collector.output_text_result(result)
+        collector.finalize()
         if not result.all_correct:
             raise typer.Exit(1)
     else:
-        typer.echo(f"Error: {result.error_message}", err=True)
+        collector.output_text_error(result.error_message or "Unknown error")
+        collector.finalize()
         raise typer.Exit(1)
@@ -2182,6 +2292,8 @@ def gpumode_evaluate(  # noqa: PLR0913, PLR0915
         acquired_target = pool_lock_context.__enter__()
         if acquired_target is None:
+            # Exit context manager before raising to avoid resource leak
+            pool_lock_context.__exit__(None, None, None)
             typer.echo(f"Error: All targets in pool '{pool}' are busy", err=True)
             typer.echo(f"  Targets: {', '.join(usable_targets)}", err=True)
             raise typer.Exit(1)
@@ -2402,6 +2514,7 @@ def _run_api_mode(  # noqa: PLR0913
     upload_dir: Path | None,
     workspace_id: str | None,
     gpu_id: int | None,
+    gpu_count: int,
     docker_image: str | None,
     docker_entrypoint: str | None,
     pull_image: bool,
@@ -2416,6 +2529,8 @@ def _run_api_mode(  # noqa: PLR0913
         typer.echo(f"Workspace: {workspace_id}")
     if gpu_id is not None:
         typer.echo(f"GPU: {gpu_id}")
+    if gpu_count > 1:
+        typer.echo(f"GPU count: {gpu_count}")
     if docker_image:
         typer.echo(f"Image: {docker_image}")
     if docker_entrypoint:
@@ -2433,6 +2548,7 @@ def _run_api_mode(  # noqa: PLR0913
             upload_dir=upload_dir,
             workspace_id=workspace_id,
             gpu_id=gpu_id,
+            gpu_count=gpu_count,
             docker_image=docker_image,
             docker_entrypoint=docker_entrypoint,
             pull_image=pull_image,
@@ -2456,6 +2572,7 @@ def remote_run(  # noqa: PLR0913
         None, "--workspace-id", "-w", help="Workspace ID (from wafer push)"
     ),
     gpu_id: int | None = typer.Option(None, "--gpu", "-g", help="GPU ID"),
+    gpu_count: int = typer.Option(1, "--gpu-count", "-n", help="Number of GPUs (1-8)"),
     docker_image: str | None = typer.Option(None, "--image", "-i", help="Docker image override"),
     docker_entrypoint: str | None = typer.Option(
         None, "--docker-entrypoint", help="Override Docker entrypoint (e.g., 'bash')"
@@ -2525,6 +2642,7 @@ def remote_run(  # noqa: PLR0913
             upload_dir,
             workspace_id,
             gpu_id,
+            gpu_count,
             docker_image,
             docker_entrypoint,
             pull_image,
@@ -4108,6 +4226,81 @@ def billing_portal(
         raise typer.Exit(1) from None
+# =============================================================================
+# SSH Keys commands (BYOK - Bring Your Own Key)
+# =============================================================================
+@ssh_keys_app.command("list")
+def ssh_keys_list(
+    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
+) -> None:
+    """List all registered SSH public keys.
+    Example:
+        wafer ssh-keys list
+        wafer ssh-keys list --json
+    """
+    from .ssh_keys import list_ssh_keys
+    try:
+        result = list_ssh_keys(json_output=json_output)
+        typer.echo(result)
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from e
+@ssh_keys_app.command("add")
+def ssh_keys_add(
+    pubkey_path: Path | None = typer.Argument(
+        None, help="Path to public key file (auto-detects ~/.ssh/id_ed25519.pub if not specified)"
+    ),
+    name: str | None = typer.Option(None, "--name", "-n", help="Friendly name for the key"),
+    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
+) -> None:
+    """Add an SSH public key.
+    If no path is specified, auto-detects keys from ~/.ssh/ in preference order:
+    id_ed25519.pub, id_rsa.pub, id_ecdsa.pub.
+    Example:
+        wafer ssh-keys add                              # Auto-detect
+        wafer ssh-keys add ~/.ssh/id_rsa.pub            # Specific file
+        wafer ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
+    """
+    from .ssh_keys import add_ssh_key
+    try:
+        result = add_ssh_key(pubkey_path=pubkey_path, name=name, json_output=json_output)
+        typer.echo(result)
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from e
+@ssh_keys_app.command("remove")
+def ssh_keys_remove(
+    key_id: str = typer.Argument(..., help="UUID of the SSH key to remove"),
+    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
+) -> None:
+    """Remove an SSH public key.
+    Get the key ID from 'wafer ssh-keys list'.
+    Example:
+        wafer ssh-keys remove abc123-def456-...
+    """
+    from .ssh_keys import remove_ssh_key
+    try:
+        result = remove_ssh_key(key_id=key_id, json_output=json_output)
+        typer.echo(result)
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from e
 # =============================================================================
 # Workspaces commands
 # =============================================================================
@@ -4136,21 +4329,34 @@ def workspaces_list(
 @workspaces_app.command("create")
 def workspaces_create(
     name: str = typer.Argument(..., help="Workspace name"),
-    gpu_type: str = typer.Option("B200", "--gpu", "-g", help="GPU type (default: B200)"),
+    gpu_type: str = typer.Option("B200", "--gpu", "-g", help="GPU type: MI300X (AMD) or B200 (NVIDIA, default)"),
     image: str | None = typer.Option(None, "--image", "-i", help="Docker image (optional)"),
+    wait: bool = typer.Option(False, "--wait", "-w", help="Wait for provisioning and show SSH credentials"),
     json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
 ) -> None:
     """Create a new workspace.
+    Available GPUs:
+        MI300X  AMD Instinct MI300X (192GB HBM3, ROCm)
+        B200    NVIDIA Blackwell B200 (180GB HBM3e, CUDA)
     Example:
-        wafer workspaces create my-kernel
-        wafer workspaces create my-kernel --gpu H100
+        wafer workspaces create my-kernel                # B200 (default)
+        wafer workspaces create my-kernel --gpu MI300X   # AMD MI300X
+        wafer workspaces create my-kernel --gpu B200     # NVIDIA B200
         wafer workspaces create my-kernel --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
+        wafer workspaces create my-kernel --wait
     """
     from .workspaces import create_workspace
     try:
-        result = create_workspace(name, gpu_type=gpu_type, image=image, json_output=json_output)
+        result = create_workspace(
+            name,
+            gpu_type=gpu_type,
+            image=image,
+            wait=wait,
+            json_output=json_output,
+        )
         typer.echo(result)
     except RuntimeError as e:
         typer.echo(f"Error: {e}", err=True)
@@ -4160,16 +4366,23 @@ def workspaces_create(
 @workspaces_app.command("delete")
 def workspaces_delete(
     workspace_id: str = typer.Argument(..., help="Workspace ID to delete"),
+    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"),
     json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
 ) -> None:
     """Delete a workspace.
     Example:
         wafer workspaces delete ws_abc123
+        wafer workspaces delete ws_abc123 -y
     """
     from .workspaces import delete_workspace
     try:
+        if not yes:
+            confirm = typer.confirm(f"Delete workspace '{workspace_id}'?")
+            if not confirm:
+                typer.echo("Cancelled.")
+                raise typer.Exit(0)
         result = delete_workspace(workspace_id, json_output=json_output)
         typer.echo(result)
     except RuntimeError as e:
@@ -4177,32 +4390,6 @@ def workspaces_delete(
         raise typer.Exit(1) from None
-@workspaces_app.command("attach")
-def workspaces_attach(
-    workspace_id: str = typer.Argument(..., help="Workspace ID to attach to"),
-    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
-) -> None:
-    """Attach to a workspace (get SSH credentials).
-    This will:
-    1. Start the workspace if needed
-    2. Return SSH connection details
-    3. Save the private key to ~/.wafer/keys/
-    Example:
-        wafer workspaces attach ws_abc123
-        wafer workspaces attach ws_abc123 --json
-    """
-    from .workspaces import attach_workspace
-    try:
-        result = attach_workspace(workspace_id, json_output=json_output)
-        typer.echo(result)
-    except RuntimeError as e:
-        typer.echo(f"Error: {e}", err=True)
-        raise typer.Exit(1) from None
 @workspaces_app.command("show")
 def workspaces_show(
     workspace_id: str = typer.Argument(..., help="Workspace ID to show"),
@@ -4224,12 +4411,19 @@ def workspaces_show(
         raise typer.Exit(1) from None
-@workspaces_app.command("exec", context_settings={"allow_interspersed_args": False})
+@workspaces_app.command(
+    "exec",
+    context_settings={
+        "allow_interspersed_args": False,
+        "ignore_unknown_options": True,
+        "allow_extra_args": True,
+    },
+)
 def workspaces_exec(
+    ctx: typer.Context,
     workspace: str | None = typer.Argument(
         None, help="Workspace name or ID (optional if default set)"
     ),
-    command: list[str] = typer.Argument(..., help="Command to execute"),
     timeout: int | None = typer.Option(
         None,
         "--timeout",
@@ -4247,6 +4441,7 @@ def workspaces_exec(
     baremetal: bool = typer.Option(
         False, "--baremetal", help="Force baremetal target (for hardware counters like ncu/nsys)"
     ),
+    pull_image: bool = typer.Option(False, "--pull-image", help="Pull image on target if missing"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
     quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
 ) -> None:
@@ -4263,6 +4458,8 @@ def workspaces_exec(
     If workspace is not specified, uses the default workspace from config,
     or the only workspace if you have exactly one.
+    IMPORTANT: Options must come before the workspace name.
     Examples:
         wafer workspaces exec dev -- python train.py
         wafer workspaces exec dev -- python -c "import torch; print(torch.cuda.is_available())"
@@ -4273,6 +4470,34 @@ def workspaces_exec(
     from .global_config import get_defaults, get_preferences
     from .workspaces import exec_command, resolve_workspace, sync_files
+    # Enforce option ordering to avoid treating CLI flags as remote commands
+    known_options = {
+        "--timeout",
+        "-t",
+        "--sync",
+        "-s",
+        "--gpu",
+        "--cpu",
+        "--baremetal",
+        "--pull-image",
+        "--verbose",
+        "-v",
+        "--quiet",
+        "-q",
+        "--help",
+        "-h",
+    }
+    for arg in ctx.args:
+        if arg == "--":
+            break
+        if arg in known_options:
+            typer.echo(
+                "Error: options must come before the workspace name. "
+                "Example: wafer workspaces exec --pull-image dev -- python -V",
+                err=True,
+            )
+            raise typer.Exit(1)
     # Validate mutually exclusive routing flags
     routing_flags = sum([gpu, cpu, baremetal])
     if routing_flags > 1:
@@ -4339,27 +4564,30 @@ def workspaces_exec(
             typer.echo(f"Error: {e}", err=True)
             raise typer.Exit(1) from None
+    # Get command from context args (passthrough after --)
+    import shlex
+    command = list(ctx.args)
+    if command and command[0] == "--":
+        command = command[1:]
+    if not command:
+        typer.echo("Error: No command specified", err=True)
+        raise typer.Exit(1)
     if show_status:
         typer.echo(f"[wafer] Executing (timeout: {effective_timeout}s)...", err=True)
-    # Join command list into shell command string, stripping leading "--" separator
-    if isinstance(command, list):
-        import shlex
-        # Remove leading "--" if present (typer passes it through with allow_interspersed_args=False)
-        if command and command[0] == "--":
-            command = command[1:]
-        # Handle two cases:
-        # 1. Single element: user quoted the whole command (e.g., "echo hello world")
-        #    -> use directly, don't re-quote
-        # 2. Multiple elements: user passed separate args (e.g., -- python -c "print(1)")
-        #    -> use shlex.join to properly quote args with spaces
-        if len(command) == 1:
-            command_str = command[0]
-        else:
-            command_str = shlex.join(command)
+    # Build command string
+    # Handle two cases:
+    # 1. Single element: user quoted the whole command (e.g., "echo hello world")
+    #    -> use directly, don't re-quote
+    # 2. Multiple elements: user passed separate args (e.g., -- python -c "print(1)")
+    #    -> use shlex.join to properly quote args with spaces
+    if len(command) == 1:
+        command_str = command[0]
     else:
-        command_str = command
+        command_str = shlex.join(command)
     try:
         exit_code = exec_command(
@@ -4367,6 +4595,7 @@ def workspaces_exec(
             command=command_str,
             timeout_seconds=effective_timeout,
             routing=routing,
+            pull_image=pull_image,
         )
     except RuntimeError as e:
         typer.echo(f"Error: {e}", err=True)
@@ -4386,7 +4615,7 @@ def workspaces_ssh(
 ) -> None:
     """SSH into a workspace.
-    Gets SSH credentials via attach, then execs into SSH.
+    Uses workspace SSH credentials once the workspace is running.
     If workspace is not specified, uses the default workspace.
     Examples:
@@ -4395,7 +4624,7 @@ def workspaces_ssh(
     """
     import os
-    from .workspaces import get_ssh_credentials, resolve_workspace
+    from .workspaces import get_workspace_raw, resolve_workspace
     # Resolve workspace
     try:
@@ -4406,26 +4635,39 @@ def workspaces_ssh(
     typer.echo(f"Connecting to workspace: {resolved_workspace}...", err=True)
-    # Get SSH credentials (this calls attach)
+    # Get SSH credentials from workspace
     try:
-        creds = get_ssh_credentials(resolved_workspace)
+        ws = get_workspace_raw(resolved_workspace)
     except RuntimeError as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1) from None
-    # Exec into SSH - replaces this process
-    ssh_args = [
-        "ssh",
-        "-i",
-        str(creds.key_path),
+    from .workspaces import VALID_STATUSES
+    workspace_status = ws.get("status")
+    assert workspace_status in VALID_STATUSES, (
+        f"Workspace {resolved_workspace} has invalid status '{workspace_status}'. "
+        f"Valid statuses: {VALID_STATUSES}"
+    )
+    if workspace_status != "running":
+        typer.echo(f"Error: Workspace is {workspace_status}. Wait for it to be running.", err=True)
+        raise typer.Exit(1)
+    if not ws.get("ssh_host") or not ws.get("ssh_port") or not ws.get("ssh_user"):
+        typer.echo("Error: SSH credentials not available yet.", err=True)
+        raise typer.Exit(1)
+    # Build SSH args - key_path is None for BYOK model (uses default SSH key)
+    ssh_args = ["ssh"]
+    ssh_args.extend([
         "-p",
-        str(creds.port),
+        str(ws.get("ssh_port")),
         "-o",
         "StrictHostKeyChecking=no",
         "-o",
         "UserKnownHostsFile=/dev/null",
-        f"{creds.user}@{creds.host}",
-    ]
+        f"{ws.get('ssh_user')}@{ws.get('ssh_host')}",
+    ])
     # Replace current process with SSH
     os.execvp("ssh", ssh_args)
@@ -4492,51 +4734,568 @@ def workspaces_sync(
 # =============================================================================
-# Perfetto trace analysis commands
+# Target operations commands (exec/ssh/sync)
 # =============================================================================
-@perfetto_app.command("query")
-def perfetto_query(
-    trace_path: Path = typer.Argument(..., help="Path to Perfetto trace file"),
-    sql: str = typer.Argument(..., help="SQL query to execute"),
-    json_output: bool = typer.Option(True, "--json", "-j", help="Output as JSON"),
+@targets_ops_app.command("exec", context_settings={"allow_interspersed_args": False})
+def targets_exec(
+    target: str = typer.Argument(
+        ...,
+        help="Target name",
+        autocompletion=complete_target_name,
+    ),
+    command: list[str] = typer.Argument(..., help="Command to execute"),
+    timeout: int | None = typer.Option(
+        None,
+        "--timeout",
+        "-t",
+        help="Execution timeout in seconds (default: 300)",
+    ),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
 ) -> None:
-    """Execute SQL query against a Perfetto trace.
+    """Execute a command on a configured target.
-    Starts trace_processor, loads the trace, executes the query, and returns results.
+    Provisions the target if needed (RunPod, DigitalOcean), then runs the command via SSH.
+    For cloud targets, the instance is kept alive after execution - use
+    'wafer config targets cleanup <name>' to terminate.
+    Supported targets: RunPod, DigitalOcean, SSH (baremetal/vm).
+    Not supported: Modal (serverless), Local (no SSH), Workspace (use 'wafer workspaces exec').
+    The command is validated before the target is contacted, so an empty command
+    never triggers provisioning. The process exits with the remote command's exit
+    code, or with 1 if the target cannot be loaded, the connection fails, or no
+    command is given.
     Examples:
-        wafer perfetto query trace.perfetto "SELECT * FROM slice LIMIT 10"
-        wafer perfetto query trace.perfetto "SELECT name, dur FROM slice ORDER BY dur DESC LIMIT 5"
+        wafer targets exec runpod-mi300x -- python -c "import torch; print(torch.cuda.is_available())"
+        wafer targets exec runpod-mi300x -- rocm-smi
+        wafer targets exec my-ssh-server -- nvidia-smi
+        wafer targets exec runpod-mi300x "echo hello && ls -la" --timeout 60
     """
-    from wafer_core.lib.perfetto.perfetto_tool import PerfettoConfig, PerfettoTool
+    from .global_config import get_preferences
+    from .targets import load_target
+    from .targets_ops import TargetExecError, exec_on_target_sync, get_target_ssh_info
-    config = PerfettoConfig(
-        workspace_root=".",
-        storage_dir=str(Path.home() / ".wafer" / "perfetto"),
-    )
-    tool = PerfettoTool(config)
+    # Determine verbosity: --quiet wins over --verbose, otherwise follow the saved preference
+    prefs = get_preferences()
+    if quiet:
+        show_status = False
+    elif verbose:
+        show_status = True
+    else:
+        show_status = prefs.mode == "explicit"
+    # Load target
     try:
-        results, err = tool.query(sql, str(trace_path))
-        if err:
-            typer.echo(f"Error: {err}", err=True)
-            raise typer.Exit(1)
-        if json_output:
-            typer.echo(json.dumps({"results": results, "count": len(results or [])}, indent=2))
-        else:
-            if not results:
-                typer.echo("No results")
-            else:
-                # Simple table output
-                if results:
-                    headers = list(results[0].keys())
-                    typer.echo("\t".join(headers))
-                    for row in results:
-                        typer.echo("\t".join(str(row.get(h, "")) for h in headers))
-    except Exception as e:
+        target_config = load_target(target)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        typer.echo("List available targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as e:
+        typer.echo(f"Error loading target config: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Target: {target} ({type(target_config).__name__})", err=True)
+    # Build and validate the command string BEFORE connecting: getting SSH info
+    # may provision a cloud instance, which must not happen for an empty command.
+    if isinstance(command, list):
+        import shlex
+        # Remove leading "--" if present
+        if command and command[0] == "--":
+            command = command[1:]
+        if not command:
+            typer.echo("Error: No command specified", err=True)
+            raise typer.Exit(1)
+        # A single element is taken as an already-complete shell command and is
+        # used verbatim; multiple elements are quoted and joined.
+        if len(command) == 1:
+            command_str = command[0]
+        else:
+            command_str = shlex.join(command)
+    else:
+        command_str = command
+    # Get SSH info (may provision)
+    if show_status:
+        typer.echo("[wafer] Connecting to target...", err=True)
+    try:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
+    # Default timeout
+    effective_timeout = timeout if timeout is not None else 300
+    if show_status:
+        typer.echo(f"[wafer] Executing (timeout: {effective_timeout}s)...", err=True)
+    # Execute
+    try:
+        exit_code = exec_on_target_sync(ssh_info, command_str, effective_timeout)
+    except TargetExecError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Exit code: {exit_code}", err=True)
+    # Propagate the remote exit code as this process's exit status
+    raise typer.Exit(exit_code)
+@targets_ops_app.command("ssh")
+def targets_ssh(
+    target: str = typer.Argument(
+        ...,
+        help="Target name",
+        autocompletion=complete_target_name,
+    ),
+) -> None:
+    """SSH into a configured target.
+    Provisions the target if needed (RunPod, DigitalOcean), then starts an interactive SSH session.
+    For cloud targets, the instance is kept alive - use 'wafer config targets cleanup <name>' to terminate.
+    On success this function does not return: the current process is replaced by
+    the ssh client. It exits with status 1 if the target config cannot be loaded
+    or the SSH connection details cannot be obtained.
+    Examples:
+        wafer targets ssh runpod-mi300x
+        wafer targets ssh my-baremetal-server
+    """
+    # Imported locally, matching workspaces_ssh, so execvp below does not depend
+    # on a module-level import of os.
+    import os
+    from .targets import load_target
+    from .targets_ops import TargetExecError, get_target_ssh_info
+    # Load target
+    try:
+        target_config = load_target(target)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        typer.echo("List available targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as e:
+        typer.echo(f"Error loading target config: {e}", err=True)
+        raise typer.Exit(1) from None
+    typer.echo(f"Connecting to target: {target}...", err=True)
+    # Get SSH info (may provision)
+    try:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    # Build SSH command
+    ssh_args = ["ssh"]
+    # NOTE(review): presumably key_path can be unset for targets that rely on the
+    # user's default SSH identity - confirm against targets_ops. In that case omit
+    # "-i" rather than passing the literal string "None" as an identity file.
+    if ssh_info.key_path:
+        ssh_args.extend(["-i", str(ssh_info.key_path)])
+    ssh_args.extend([
+        "-p",
+        str(ssh_info.port),
+        "-o",
+        "StrictHostKeyChecking=no",
+        "-o",
+        "UserKnownHostsFile=/dev/null",
+        f"{ssh_info.user}@{ssh_info.host}",
+    ])
+    # Replace current process with SSH
+    os.execvp("ssh", ssh_args)
+@targets_ops_app.command("sync")
+def targets_sync(
+    target: str = typer.Argument(
+        ...,
+        help="Target name",
+        autocompletion=complete_target_name,
+    ),
+    path: Path = typer.Argument(..., help="Local file or directory to sync"),
+    dest: str | None = typer.Option(
+        None,
+        "--dest",
+        "-d",
+        help="Remote destination path (default: /tmp/<basename>)",
+    ),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
+) -> None:
+    """Copy a local file or directory to a configured target.
+    Uses rsync over SSH to copy files to the target. Provisions the target if needed.
+    The local path must exist; it is resolved to an absolute path before syncing.
+    Exits with status 1 if the path is missing, the target config cannot be
+    loaded, or connecting/syncing fails.
+    Examples:
+        wafer targets sync runpod-mi300x ./my-project
+        wafer targets sync runpod-mi300x ./script.py --dest /workspace/script.py
+        wafer targets sync my-server ./kernels --dest /tmp/kernels
+    """
+    from .global_config import get_preferences
+    from .targets import load_target
+    from .targets_ops import TargetExecError, get_target_ssh_info, sync_to_target
+    # --quiet beats --verbose; with neither flag the saved preference decides
+    prefs = get_preferences()
+    show_status = not quiet and (verbose or prefs.mode == "explicit")
+    # Fail fast on a missing local path, before touching the target
+    if not path.exists():
+        typer.echo(f"Error: Path not found: {path}", err=True)
+        raise typer.Exit(1)
+    # Resolve the target configuration by name
+    try:
+        target_config = load_target(target)
+    except FileNotFoundError as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        typer.echo("List available targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as exc:
+        typer.echo(f"Error loading target config: {exc}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Target: {target} ({type(target_config).__name__})", err=True)
+        typer.echo("[wafer] Connecting to target...", err=True)
+    # Obtain SSH connection details (may provision)
+    try:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
+    # Progress callback handed to sync_to_target; silent unless status output is on
+    def report(msg: str) -> None:
+        if show_status:
+            typer.echo(f"[wafer] {msg}", err=True)
+    local_source = path.resolve()
+    try:
+        file_count = sync_to_target(ssh_info, local_source, dest, report)
+    except TargetExecError as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Done. Synced {file_count} files.", err=True)
+@targets_ops_app.command("scp")
+def targets_scp(
+    source: str = typer.Argument(..., help="Source path (prefix with target: for remote)"),
+    dest: str = typer.Argument(..., help="Destination path (prefix with target: for remote)"),
+    recursive: bool = typer.Option(False, "-r", "--recursive", help="Copy directories recursively"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
+) -> None:
+    """Copy files between the local machine and a target, scp-style.
+    A path written as target:path is remote; exactly one of source and dest must
+    be remote. A remote source means download, a remote dest means upload. For
+    uploads the local path must exist, and directories require -r.
+    Exits with status 1 on invalid arguments, an unknown target, or a failed
+    connection or transfer.
+    Examples:
+        wafer targets scp runpod-mi300x:/tmp/trace.json ./trace.json  # download
+        wafer targets scp ./script.py runpod-mi300x:/tmp/script.py    # upload
+        wafer targets scp -r ./kernels runpod-mi300x:/tmp/kernels     # upload dir
+        wafer targets scp -r runpod-mi300x:/tmp/results ./results     # download dir
+    """
+    from .global_config import get_preferences
+    from .targets import load_target
+    from .targets_ops import TargetExecError, get_target_ssh_info, parse_scp_path, scp_transfer
+    # --quiet beats --verbose; with neither flag the saved preference decides
+    prefs = get_preferences()
+    show_status = not quiet and (verbose or prefs.mode == "explicit")
+    # Split each argument into (target name or None, path)
+    src_target, src_path = parse_scp_path(source)
+    dst_target, dst_path = parse_scp_path(dest)
+    # Exactly one side may name a target
+    if src_target and dst_target:
+        typer.echo("Error: Both paths are remote. Use ssh to transfer between remotes.", err=True)
+        raise typer.Exit(1)
+    if not (src_target or dst_target):
+        typer.echo("Error: Both paths are local. Use regular cp command.", err=True)
+        raise typer.Exit(1)
+    # A remote source means we are downloading from that target
+    is_download = src_target is not None
+    target_name = src_target if is_download else dst_target
+    # Resolve the target configuration by name
+    try:
+        target_config = load_target(target_name)
+    except FileNotFoundError:
+        typer.echo(f"Error: Target '{target_name}' not found.", err=True)
+        typer.echo("Run 'wafer config targets list' to see available targets.", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as exc:
+        typer.echo(f"Error loading target config: {exc}", err=True)
+        raise typer.Exit(1) from None
+    # Uploads: the local source must exist, and directories need -r
+    if not is_download:
+        upload_source = Path(src_path)
+        if not upload_source.exists():
+            typer.echo(f"Error: Local path '{src_path}' does not exist.", err=True)
+            raise typer.Exit(1)
+        if upload_source.is_dir() and not recursive:
+            typer.echo(
+                f"Error: '{src_path}' is a directory. Use -r flag for recursive copy.", err=True
+            )
+            raise typer.Exit(1)
+    if show_status:
+        typer.echo(f"[wafer] Target: {target_name} ({type(target_config).__name__})", err=True)
+        typer.echo("[wafer] Connecting to target...", err=True)
+    # Obtain SSH connection details (may provision)
+    try:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
+        typer.echo(f"[wafer] {'Downloading' if is_download else 'Uploading'}...", err=True)
+    # Transfer; the direction flag is the only difference between upload and download
+    try:
+        scp_transfer(ssh_info, src_path, dst_path, is_download=is_download, recursive=recursive)
+    except TargetExecError as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo("[wafer] Done.", err=True)
+@targets_ops_app.command("ensure")
+def targets_ensure(  # noqa: PLR0915
+    target: str = typer.Argument(
+        None,
+        help="Target name",
+        autocompletion=complete_target_name,
+    ),
+    tool: str = typer.Argument(None, help="Tool to ensure is installed"),
+    check_only: bool = typer.Option(False, "--check-only", "-c", help="Only check, don't install"),
+    force: bool = typer.Option(False, "--force", "-f", help="Reinstall even if present"),
+    list_tools: bool = typer.Option(False, "--list", "-l", help="List available tools"),
+    timeout: int = typer.Option(300, "--timeout", "-t", help="Installation timeout in seconds"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
+) -> None:
+    """Check for a tool on a target and install it when missing.
+    With --list, prints the tool registry grouped by platform and returns without
+    contacting any target. Otherwise both TARGET and TOOL are required, the tool
+    must be in the registry, and its platform must match the target's (or be
+    "any"). With --check-only the tool's check command is run and nothing is
+    installed. Failures exit with status 1.
+    Examples:
+        wafer targets ensure runpod-mi300x rocprof-compute
+        wafer targets ensure runpod-mi300x rocprof-compute --check-only
+        wafer targets ensure runpod-mi300x rocprof-compute --force
+        wafer targets ensure --list
+    """
+    from .global_config import get_preferences
+    from .targets import load_target
+    from .targets_ops import (
+        TOOL_REGISTRY,
+        TargetExecError,
+        ensure_tool,
+        get_target_platform,
+        get_target_ssh_info,
+    )
+    # --list: print one section per platform, each sorted by tool name
+    if list_tools:
+        typer.echo("Available tools:\n")
+        sections = (
+            ("AMD tools:", "amd"),
+            ("\nNVIDIA tools:", "nvidia"),
+            ("\nCross-platform:", "any"),
+        )
+        for heading, wanted_platform in sections:
+            typer.echo(heading)
+            for name, entry in sorted(TOOL_REGISTRY.items()):
+                if entry.platform != wanted_platform:
+                    continue
+                auto = "auto-install" if entry.install_cmd else "manual"
+                typer.echo(f"  {name:20} ({auto}) - {entry.description}")
+        return
+    # Both positionals are optional at the parser level so --list works alone;
+    # enforce them here for every other invocation.
+    def usage_error(message: str) -> None:
+        typer.echo(message, err=True)
+        typer.echo("Usage: wafer targets ensure TARGET TOOL", err=True)
+        typer.echo("   or: wafer targets ensure --list", err=True)
+        raise typer.Exit(1)
+    if not target:
+        usage_error("Error: Missing argument 'TARGET'")
+    if not tool:
+        usage_error("Error: Missing argument 'TOOL'")
+    # The tool must be known to the registry
+    if tool not in TOOL_REGISTRY:
+        typer.echo(f"Error: Unknown tool '{tool}'", err=True)
+        typer.echo(f"Available tools: {', '.join(sorted(TOOL_REGISTRY.keys()))}", err=True)
+        typer.echo("Run 'wafer targets ensure --list' for details.", err=True)
+        raise typer.Exit(1)
+    spec = TOOL_REGISTRY[tool]
+    # --quiet beats --verbose; with neither flag the saved preference decides
+    prefs = get_preferences()
+    show_status = not quiet and (verbose or prefs.mode == "explicit")
+    # Resolve the target configuration by name
+    try:
+        target_config = load_target(target)
+    except FileNotFoundError as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        typer.echo("List available targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as exc:
+        typer.echo(f"Error loading target config: {exc}", err=True)
+        raise typer.Exit(1) from None
+    # Refuse platform-specific tools on a target of a different platform
+    platform = get_target_platform(target_config)
+    if spec.platform not in ("any", platform):
+        typer.echo(
+            f"Error: {tool} is an {spec.platform.upper()} tool but target '{target}' "
+            f"is {platform.upper()}",
+            err=True,
+        )
+        raise typer.Exit(1)
+    if show_status:
+        typer.echo(f"[wafer] Target: {target} ({platform.upper()})", err=True)
+        typer.echo(f"[wafer] Checking for {tool}...", err=True)
+    # Obtain SSH connection details (may provision)
+    try:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
+    # --check-only: run the tool's check command and report, never install
+    if check_only:
+        from .targets_ops import exec_on_target_sync
+        try:
+            exit_code = exec_on_target_sync(ssh_info, spec.check_cmd, timeout_seconds=30)
+        except TargetExecError as exc:
+            typer.echo(f"Error: {exc}", err=True)
+            raise typer.Exit(1) from None
+        if exit_code != 0:
+            typer.echo(f"{tool} is NOT installed", err=True)
+            raise typer.Exit(1)
+        typer.echo(f"{tool} is installed")
+        return
+    # Install if needed and report the outcome
+    outcome = ensure_tool(ssh_info, tool, force=force, timeout=timeout)
+    if outcome.error:
+        typer.echo(f"Error: {outcome.error}", err=True)
+        raise typer.Exit(1)
+    if outcome.already_installed:
+        typer.echo(f"{tool} is already installed")
+        return
+    if outcome.installed:
+        if outcome.verified:
+            typer.echo(f"{tool} installed successfully")
+        else:
+            typer.echo(f"{tool} installed (verification skipped)")
+# =============================================================================
+# Perfetto trace analysis commands
+# =============================================================================
+@perfetto_app.command("query")
+def perfetto_query(
+    trace_path: Path = typer.Argument(..., help="Path to Perfetto trace file"),
+    sql: str = typer.Argument(..., help="SQL query to execute"),
+    json_output: bool = typer.Option(True, "--json", "-j", help="Output as JSON"),
+) -> None:
+    """Execute SQL query against a Perfetto trace.
+    Starts trace_processor, loads the trace, executes the query, and returns results.
+    Results are printed as a JSON object with "results" and "count" keys when
+    json_output is set, otherwise as a tab-separated table with a header row
+    ("No results" when the query returns nothing).
+    Exits with status 1 if the query reports an error or an exception is raised.
+    Examples:
+        wafer perfetto query trace.perfetto "SELECT * FROM slice LIMIT 10"
+        wafer perfetto query trace.perfetto "SELECT name, dur FROM slice ORDER BY dur DESC LIMIT 5"
+    """
+    from wafer_core.lib.perfetto.perfetto_tool import PerfettoConfig, PerfettoTool
+    config = PerfettoConfig(
+        workspace_root=".",
+        storage_dir=str(Path.home() / ".wafer" / "perfetto"),
+    )
+    tool = PerfettoTool(config)
+    try:
+        results, err = tool.query(sql, str(trace_path))
+        if err:
+            typer.echo(f"Error: {err}", err=True)
+            raise typer.Exit(1)
+        if json_output:
+            typer.echo(json.dumps({"results": results, "count": len(results or [])}, indent=2))
+        elif not results:
+            typer.echo("No results")
+        else:
+            # Simple table output: header from the first row's keys, one line per row
+            headers = list(results[0].keys())
+            typer.echo("\t".join(headers))
+            for row in results:
+                typer.echo("\t".join(str(row.get(h, "")) for h in headers))
+    except typer.Exit:
+        # typer.Exit is a RuntimeError subclass; let it through so the generic
+        # handler below does not print a second, bogus "Error: 1" line.
+        raise
+    except Exception as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1) from None
@@ -4774,13 +5533,39 @@ def ncu_analyze(
 # =============================================================================
-# NSYS Analyze command
+# NSYS commands
 # =============================================================================
+@nsys_app.command("check")
+def nsys_check() -> None:
+    """Report whether NSYS (Nsight Systems) is installed.
+    Prints the detected path, plus the version when one is reported. When NSYS is
+    not installed, prints the install command if one is available.
+    Examples:
+        wafer nvidia nsys check
+    """
+    from .nsys_analyze import check_nsys_installation
+    status = check_nsys_installation()
+    # Handle the missing case first so the installed path reads straight through
+    if not status.installed:
+        typer.echo("✗ NSYS not installed")
+        if status.install_command:
+            typer.echo(f"  Install with: {status.install_command}")
+        return
+    typer.echo(f"✓ NSYS installed: {status.path}")
+    if status.version:
+        typer.echo(f"  Version: {status.version}")
 @nsys_app.command("analyze")
 def nsys_analyze(
     filepath: Path = typer.Argument(..., help="Path to .nsys-rep profile file"),
+    output_dir: Path | None = typer.Option(
+        None, "--output-dir", "-o", help="Output directory for analysis files"
+    ),
     json_output: bool = typer.Option(
         False, "--json", help="Output raw JSON instead of formatted text"
     ),
@@ -4789,6 +5574,12 @@ def nsys_analyze(
         "--remote/--local",
         help="Force remote (via API) or local analysis. Default: auto-detect (remote if nsys not installed locally)",
     ),
+    target: str | None = typer.Option(
+        None,
+        "--target",
+        "-t",
+        help="Remote target: 'workspace:id' for workspace execution, or target name from ~/.wafer/targets/",
+    ),
 ) -> None:
     """Analyze an NVIDIA Nsight Systems profile (.nsys-rep file).
@@ -4797,10 +5588,20 @@ def nsys_analyze(
     By default, uses local nsys if available, otherwise runs analysis
     remotely via wafer-api (requires authentication: wafer login).
+    Supports multiple execution modes:
+    - Local: Uses local nsys CLI (no GPU required for analysis)
+    - Remote API: Uploads file and runs analysis on Modal
+    - Workspace: Runs analysis on a Wafer workspace via SSH
+    - Target: Runs analysis on a configured target machine via SSH
     Examples:
         wafer nvidia nsys analyze profile.nsys-rep
         wafer nvidia nsys analyze profile.nsys-rep --json
+        wafer nvidia nsys analyze profile.nsys-rep --local
         wafer nvidia nsys analyze profile.nsys-rep --remote
+        wafer nvidia nsys analyze profile.nsys-rep --target workspace:abc123
+        wafer nvidia nsys analyze profile.nsys-rep --target vultr-b200
+        wafer nvidia nsys analyze profile.nsys-rep -o ./results/
     """
     from .nsys_analyze import analyze_nsys_profile
@@ -4812,11 +5613,20 @@ def nsys_analyze(
         typer.echo(f"Error: Expected .nsys-rep file, got: {filepath.suffix}", err=True)
         raise typer.Exit(1)
+    # Warn if both remote flag and target are specified
+    if target and remote is not None:
+        typer.echo(
+            "Warning: --target overrides --remote/--local flag",
+            err=True,
+        )
     try:
         result = analyze_nsys_profile(
             filepath,
             json_output=json_output,
             remote=remote,
+            target=target,
+            output_dir=output_dir,
         )
         typer.echo(result)
     except FileNotFoundError as e:
@@ -4827,6 +5637,150 @@ def nsys_analyze(
         raise typer.Exit(1) from None
+@nsys_app.command("profile", context_settings={"allow_interspersed_args": False})
+def nsys_profile(
+    command: list[str] = typer.Argument(..., help="Command to profile"),
+    output: str = typer.Option(
+        "profile",
+        "--output",
+        "-o",
+        help="Output filename (without .nsys-rep extension)",
+    ),
+    trace: str | None = typer.Option(
+        None,
+        "--trace",
+        "-t",
+        help="Trace APIs to capture (comma-separated: cuda,nvtx,osrt,cudnn,cublas). Default: cuda",
+    ),
+    duration: int | None = typer.Option(
+        None,
+        "--duration",
+        "-d",
+        help="Maximum profiling duration in seconds",
+    ),
+    target: str | None = typer.Option(
+        None,
+        "--target",
+        help="Remote target: 'workspace:id' for workspace execution, or target name from ~/.wafer/targets/",
+    ),
+    analyze: bool = typer.Option(
+        False,
+        "--analyze",
+        "-a",
+        help="Automatically analyze the profile after completion",
+    ),
+    json_output: bool = typer.Option(
+        False,
+        "--json",
+        help="Output analysis as JSON (only with --analyze)",
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Show verbose progress messages",
+    ),
+    extra_args: str | None = typer.Option(
+        None,
+        "--extra",
+        help="Extra arguments to pass to nsys profile",
+    ),
+) -> None:
+    """Profile a command with NVIDIA Nsight Systems.
+    Runs nsys profile on the specified command and generates a .nsys-rep file.
+    Profiling requires an NVIDIA GPU. Use --target to run on a remote GPU server
+    or workspace.
+    Examples:
+        wafer nvidia nsys profile -- python train.py
+        wafer nvidia nsys profile -o gemm_profile -- ./gemm_kernel
+        wafer nvidia nsys profile --trace cuda,nvtx -- python model.py
+        wafer nvidia nsys profile --duration 60 -- ./long_running_app
+        wafer nvidia nsys profile --target workspace:abc123 -- python test.py
+        wafer nvidia nsys profile --target vultr-b200 -- ./benchmark
+        wafer nvidia nsys profile --analyze -- python train.py
+        wafer nvidia nsys profile --analyze --json -- ./kernel > results.json
+    """
+    # Imports are function-local, matching the other commands in this file.
+    import shlex
+    from .nsys_analyze import _parse_target
+    from .nsys_profile import (
+        NSYSProfileOptions,
+        profile_and_analyze,
+        profile_local,
+        profile_remote_ssh,
+        profile_workspace,
+    )
+    # Parse command: collapse the argument list into one command string.
+    if isinstance(command, list):
+        # Remove leading "--" if present
+        if command and command[0] == "--":
+            command = command[1:]
+        if len(command) == 1:
+            # A lone argument is used verbatim (not re-quoted); presumably this
+            # lets callers pass an already-quoted command line as one string.
+            command_str = command[0]
+        else:
+            # Multiple arguments are shell-quoted and joined; an empty list
+            # yields "" and is rejected below.
+            command_str = shlex.join(command)
+    else:
+        command_str = command
+    # Reject empty and whitespace-only commands alike.
+    if not command_str.strip():
+        typer.echo("Error: No command specified", err=True)
+        raise typer.Exit(1)
+    # Parse trace options. Whitespace around commas and empty entries are
+    # discarded ("cuda, nvtx" -> ["cuda", "nvtx"]); if nothing remains the
+    # value falls back to None, the same value used when --trace is omitted.
+    trace_list = None
+    if trace:
+        trace_list = [api.strip() for api in trace.split(",") if api.strip()] or None
+    # Build options
+    options = NSYSProfileOptions(
+        command=command_str,
+        output=output,
+        trace=trace_list,
+        duration=duration,
+        extra_args=extra_args,
+    )
+    if verbose:
+        typer.echo(f"[nsys] Command: {command_str}", err=True)
+        if target:
+            typer.echo(f"[nsys] Target: {target}", err=True)
+    # Execute. Only the --analyze path produces an analysis result.
+    analysis_result = None
+    if analyze:
+        profile_result, analysis_result = profile_and_analyze(
+            options,
+            target=target,
+            json_output=json_output,
+            verbose=verbose,
+        )
+    elif target:
+        # "workspace" targets go through the workspace runner; any other
+        # target type is handed to the SSH runner.
+        target_type, target_id = _parse_target(target)
+        if target_type == "workspace":
+            profile_result = profile_workspace(target_id, options, verbose=verbose)
+        else:
+            profile_result = profile_remote_ssh(target_id, options, verbose=verbose)
+    else:
+        profile_result = profile_local(options, verbose=verbose)
+    # Report results
+    if not profile_result.success:
+        typer.echo(f"Error: {profile_result.error}", err=True)
+        if profile_result.stderr:
+            typer.echo(f"stderr: {profile_result.stderr}", err=True)
+        raise typer.Exit(1)
+    # With --analyze the profile path is only reported in verbose mode.
+    if verbose or not analyze:
+        typer.echo(f"Profile created: {profile_result.output_path}")
+    # NOTE(review): a successful analysis is not echoed here; presumably
+    # profile_and_analyze emits it itself -- confirm against nsys_profile.py.
+    if analysis_result and not analysis_result.success:
+        typer.echo(f"Analysis error: {analysis_result.error}", err=True)
+        raise typer.Exit(1)
 # =============================================================================
 # ROCprof-Compute commands
 # =============================================================================
@@ -5959,13 +6913,14 @@ def capture_list_command(
 @corpus_app.command("download")
 def corpus_download(
-    name: str = typer.Argument(..., help="Corpus name (cuda, cutlass, hip)"),
+    name: str = typer.Argument(..., help="Corpus name (cuda, cutlass, hip, amd)"),
     force: bool = typer.Option(False, "--force", "-f", help="Re-download even if exists"),
 ) -> None:
     """Download a documentation corpus for agent filesystem access.
     Examples:
         wafer corpus download cuda
+        wafer corpus download amd
         wafer corpus download cutlass --force
     """
     from .corpus import CORPORA, download_corpus
@@ -6180,78 +7135,12 @@ def tracelens_collective(
 # =============================================================================
-# ISA Analysis Commands
+# Unified ISA Analysis Commands (wafer amd isa ...)
 # =============================================================================
 @isa_app.command("analyze")
 def isa_analyze(
-    file: Path = typer.Argument(..., help="Path to .co file to analyze"),
-    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
-) -> None:
-    """Analyze AMD GPU code object (.co file).
-    Extracts and analyzes ISA, showing register usage, instruction mix,
-    spills, and other performance-relevant metrics.
-    The .co file is uploaded to the Wafer API server which has ROCm tools
-    installed for analysis.
-    Examples:
-        wafer isa analyze kernel.co
-        wafer isa analyze kernel.co --json
-    """
-    from dataclasses import asdict
-    from wafer_core.tools.isa_analysis_tools import analyze_isa, format_isa_summary
-    from .auth import get_auth_headers
-    from .global_config import get_api_url
-    # Validate file exists
-    if not file.exists():
-        typer.echo(f"Error: File not found: {file}", err=True)
-        raise typer.Exit(1)
-    if not file.suffix == ".co":
-        typer.echo(f"Error: Expected .co file, got: {file.suffix}", err=True)
-        raise typer.Exit(1)
-    # Get API URL and auth
-    api_url = get_api_url()
-    auth_headers = get_auth_headers()
-    if not auth_headers:
-        typer.echo("Error: Not logged in. Run 'wafer login' first.", err=True)
-        raise typer.Exit(1)
-    try:
-        result = analyze_isa(
-            co_file_path=file,
-            api_url=api_url,
-            auth_headers=auth_headers,
-        )
-        if json_output:
-            typer.echo(json.dumps(asdict(result)))
-        else:
-            typer.echo(format_isa_summary(result))
-    except FileNotFoundError as e:
-        typer.echo(f"Error: {e}", err=True)
-        raise typer.Exit(1) from None
-    except Exception as e:
-        typer.echo(f"Error: {e}", err=True)
-        raise typer.Exit(1) from None
-# =============================================================================
-# Kernel Scope Commands (wafer amd kernel-scope ...)
-# =============================================================================
-@kernel_scope_app.command("analyze")
-def kernel_scope_analyze(
     path: Path = typer.Argument(..., help="Path to file or directory to analyze"),
     json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
     csv_output: bool = typer.Option(False, "--csv", help="Output as CSV"),
@@ -6264,24 +7153,32 @@ def kernel_scope_analyze(
     output_file: Path | None = typer.Option(None, "--output", "-o", help="Write output to file"),
     kernel_index: int = typer.Option(0, "--kernel", "-k", help="Kernel index if multiple in file"),
 ) -> None:
-    """Analyze Triton compilation artifacts (ISA, LLVM-IR, TTGIR).
+    """Analyze AMD GPU ISA files (.co, .s, .ll, .ttgir).
     Performs static analysis to extract performance metrics like register
     pressure, spills, MFMA density, and occupancy limits.
     Supports:
-      - AMDGCN ISA files (.s, .gcn, .asm)
-      - LLVM-IR files (.ll)
-      - TTGIR files (.ttgir, .ttir, .mlir)
+      - AMD GPU code objects (.co) - Requires API authentication
+      - AMDGCN ISA assembly (.s, .gcn, .asm) - Local parsing
+      - LLVM-IR files (.ll) - Local parsing
+      - TTGIR files (.ttgir, .ttir, .mlir) - Local parsing
     Examples:
-        wafer amd kernel-scope analyze kernel.s
-        wafer amd kernel-scope analyze kernel.s --json
-        wafer amd kernel-scope analyze ~/.triton/cache/ --filter 'spills > 0'
-        wafer amd kernel-scope analyze . -r --csv -o metrics.csv
+        wafer amd isa analyze kernel.co              # Code object (needs login)
+        wafer amd isa analyze kernel.s               # ISA assembly
+        wafer amd isa analyze kernel.s --json        # Output as JSON
+        wafer amd isa analyze ~/.triton/cache/ --filter 'spills > 0'
+        wafer amd isa analyze . -r --csv -o metrics.csv
     """
+    from .auth import get_auth_headers
+    from .global_config import get_api_url
     from .kernel_scope import analyze_command
+    # Get API credentials for .co files
+    api_url = get_api_url()
+    auth_headers = get_auth_headers()
     try:
         output = analyze_command(
             path=str(path),
@@ -6291,6 +7188,8 @@ def kernel_scope_analyze(
             filter_expr=filter_expr,
             output_file=str(output_file) if output_file else None,
             kernel_index=kernel_index,
+            api_url=api_url,
+            auth_headers=auth_headers,
         )
         typer.echo(output)
@@ -6305,15 +7204,15 @@ def kernel_scope_analyze(
         raise typer.Exit(1) from None
-@kernel_scope_app.command("metrics")
-def kernel_scope_metrics() -> None:
-    """List available metrics for kernel scope analysis.
+@isa_app.command("metrics")
+def isa_metrics() -> None:
+    """List available metrics for ISA analysis.
-    Shows all metrics that can be extracted from Triton compilation
-    artifacts, along with their derivation.
+    Shows all metrics that can be extracted from AMD GPU ISA files,
+    along with their derivation.
     Examples:
-        wafer amd kernel-scope metrics
+        wafer amd isa metrics
     """
     from .kernel_scope import metrics_command
@@ -6321,15 +7220,15 @@ def kernel_scope_metrics() -> None:
     typer.echo(output)
-@kernel_scope_app.command("targets")
-def kernel_scope_targets() -> None:
+@isa_app.command("targets")
+def isa_targets() -> None:
     """List supported GPU targets and their specifications.
     Shows hardware specs (VGPRs, SGPRs, LDS, etc.) for each supported
     AMD GPU architecture.
     Examples:
-        wafer amd kernel-scope targets
+        wafer amd isa targets
     """
     from .kernel_scope import targets_command

wafer-cli 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl

wafer-cli 0.2.9py3-none-any.whl → 0.2.10py3-none-any.whl