wafer-cli 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/cli.py CHANGED
@@ -1,6 +1,8 @@
1
- # ruff: noqa: PLR0913
1
+ # ruff: noqa: PLR0913, E402
2
2
  # PLR0913 (too many arguments) is suppressed because Typer CLI commands
3
3
  # naturally have many parameters - each --flag becomes a function argument.
4
+ # E402 (module level import not at top) is suppressed because we intentionally
5
+ # load .env files before importing other modules that may read env vars.
4
6
  """Wafer CLI - GPU development toolkit for LLM coding agents.
5
7
 
6
8
  Core commands:
@@ -27,6 +29,12 @@ from pathlib import Path
27
29
 
28
30
  import trio
29
31
  import typer
32
+ from dotenv import load_dotenv
33
+
34
+ # Auto-load .env from current directory and ~/.wafer/.env
35
+ # This runs at import time so env vars are available before any config is accessed
36
+ load_dotenv() # cwd/.env
37
+ load_dotenv(Path.home() / ".wafer" / ".env") # ~/.wafer/.env
30
38
 
31
39
  from .config import WaferConfig, WaferEnvironment
32
40
  from .inference import infer_upload_files, resolve_environment
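A minimal sketch of the resulting precedence, assuming python-dotenv's default semantics (load_dotenv never overrides variables that are already set, so the shell environment wins, then cwd/.env, then ~/.wafer/.env); the WAFER_API_KEY name is purely illustrative:

import os
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()                                 # cwd/.env fills only unset variables
load_dotenv(Path.home() / ".wafer" / ".env")  # global file fills whatever is still unset
print(os.environ.get("WAFER_API_KEY"))        # value from the highest-precedence source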
@@ -42,6 +50,7 @@ from .problems import (
42
50
  app = typer.Typer(
43
51
  help="GPU development toolkit for LLM coding agents",
44
52
  no_args_is_help=True,
53
+ pretty_exceptions_show_locals=False, # Don't dump local vars (makes tracebacks huge)
45
54
  )
46
55
 
47
56
  # =============================================================================
@@ -58,11 +67,11 @@ def _show_version() -> None:
58
67
  """Show CLI version and environment, then exit."""
59
68
  from .analytics import _get_cli_version
60
69
  from .global_config import load_global_config
61
-
70
+
62
71
  version = _get_cli_version()
63
72
  config = load_global_config()
64
73
  environment = config.environment
65
-
74
+
66
75
  typer.echo(f"wafer-cli {version} ({environment})")
67
76
  raise typer.Exit()
68
77
 
@@ -110,7 +119,7 @@ def main_callback(
110
119
  if version:
111
120
  _show_version()
112
121
  return
113
-
122
+
114
123
  global _command_start_time, _command_outcome
115
124
  _command_start_time = time.time()
116
125
  _command_outcome = "success" # Default to success, mark failure on exceptions
@@ -121,6 +130,7 @@ def main_callback(
121
130
  analytics.init_analytics()
122
131
 
123
132
  # Install exception hook to catch SystemExit and mark failures
133
+ # Also prints error message FIRST so it's visible even when traceback is truncated
124
134
  original_excepthook = sys.excepthook
125
135
 
126
136
  def custom_excepthook(
@@ -136,7 +146,11 @@ def main_callback(
136
146
  _command_outcome = "failure"
137
147
  else:
138
148
  _command_outcome = "failure"
139
- # Call original excepthook
149
+ # Print error summary FIRST (before traceback) so it's visible even if truncated
150
+ print(
151
+ f"\n\033[1;31m>>> ERROR: {exc_type.__name__}: {exc_value}\033[0m\n", file=sys.stderr
152
+ )
153
+ # Call original excepthook (prints the full traceback)
140
154
  original_excepthook(exc_type, exc_value, exc_traceback)
141
155
 
142
156
  sys.excepthook = custom_excepthook
@@ -591,7 +605,7 @@ app.add_typer(provider_auth_app, name="auth")
591
605
  def provider_auth_login(
592
606
  provider: str = typer.Argument(
593
607
  ...,
594
- help="Provider name: runpod, digitalocean, or modal",
608
+ help="Provider name: runpod, digitalocean, modal, anthropic, or openai",
595
609
  ),
596
610
  api_key: str | None = typer.Option(
597
611
  None,
@@ -600,15 +614,16 @@ def provider_auth_login(
600
614
  help="API key (if not provided, reads from stdin)",
601
615
  ),
602
616
  ) -> None:
603
- """Save API key for a cloud GPU provider.
617
+ """Save API key for a provider.
604
618
 
605
619
  Stores the key in ~/.wafer/auth.json. Environment variables
606
- (e.g., WAFER_RUNPOD_API_KEY) take precedence over stored keys.
620
+ (e.g., ANTHROPIC_API_KEY) take precedence over stored keys.
607
621
 
608
622
  Examples:
623
+ wafer auth login anthropic --api-key sk-ant-xxx
609
624
  wafer auth login runpod --api-key rp_xxx
610
- wafer auth login digitalocean --api-key dop_v1_xxx
611
- echo $API_KEY | wafer auth login runpod
625
+ wafer auth login openai --api-key sk-xxx
626
+ echo $API_KEY | wafer auth login anthropic
612
627
  """
613
628
  import sys
614
629
 
@@ -642,7 +657,7 @@ def provider_auth_login(
642
657
  def provider_auth_logout(
643
658
  provider: str = typer.Argument(
644
659
  ...,
645
- help="Provider name: runpod, digitalocean, or modal",
660
+ help="Provider name: runpod, digitalocean, modal, anthropic, or openai",
646
661
  ),
647
662
  ) -> None:
648
663
  """Remove stored API key for a cloud GPU provider.
@@ -3473,7 +3488,7 @@ def init_runpod(
3473
3488
  gpu_configs = {
3474
3489
  "MI300X": {
3475
3490
  "gpu_type_id": "AMD Instinct MI300X OAM",
3476
- "image": "runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04",
3491
+ "image": "rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1",
3477
3492
  "compute_capability": "9.4",
3478
3493
  },
3479
3494
  "H100": {
@@ -3569,7 +3584,7 @@ def init_digitalocean(
3569
3584
  "ssh_key": ssh_key,
3570
3585
  "region": region,
3571
3586
  "size_slug": "gpu-mi300x1-192gb-devcloud",
3572
- "image": "gpu-amd-base",
3587
+ "image": "amd-pytorchrocm7", # PyTorch (ROCm7) marketplace image
3573
3588
  "provision_timeout": 600,
3574
3589
  "eval_timeout": 600,
3575
3590
  "keep_alive": keep_alive,
@@ -4071,6 +4086,164 @@ def targets_cleanup(
4071
4086
  raise typer.Exit(1) from None
4072
4087
 
4073
4088
 
4089
+ # Known libraries that can be installed on targets
4090
+ # TODO: Consider adding HipKittens to the default RunPod/DO Docker images
4091
+ # so this install step isn't needed. For now, this command handles it.
4092
+ INSTALLABLE_LIBRARIES: dict[str, dict[str, object]] = {
4093
+ "hipkittens": {
4094
+ "description": "HipKittens - AMD port of ThunderKittens for MI300X",
4095
+ "git_url": "https://github.com/HazyResearch/hipkittens.git",
4096
+ "install_path": "/opt/hipkittens",
4097
+ "requires_amd": True,
4098
+ },
4099
+ # CK is already installed with ROCm 7.0, no action needed
4100
+ "repair-headers": {
4101
+ "description": "Repair ROCm thrust headers (fixes hipify corruption)",
4102
+ "custom_script": "apt-get update -qq && apt-get install --reinstall -y rocthrust >/dev/null 2>&1 && echo REPAIRED",
4103
+ "requires_amd": True,
4104
+ },
4105
+ }
4106
+
4107
+
4108
+ @targets_app.command("install")
4109
+ def targets_install(
4110
+ name: str = typer.Argument(..., help="Target name"),
4111
+ library: str = typer.Argument(..., help="Library to install (hipkittens, repair-headers)"),
4112
+ ) -> None:
4113
+ """Install a library or run maintenance on a target (idempotent).
4114
+
4115
+ Installs header-only libraries like HipKittens on remote targets.
4116
+ Safe to run multiple times - will skip if already installed.
4117
+
4118
+ Available libraries:
4119
+ hipkittens - HipKittens (AMD ThunderKittens port)
4120
+ repair-headers - Fix ROCm thrust headers (after hipify corruption)
4121
+
4122
+ Examples:
4123
+ wafer config targets install runpod-mi300x hipkittens
4124
+ wafer config targets install runpod-mi300x repair-headers
4125
+ wafer config targets install do-mi300x hipkittens
4126
+ """
4127
+ import subprocess
4128
+
4129
+ from .targets import load_target
4130
+ from .targets_ops import get_target_ssh_info
4131
+
4132
+ if library not in INSTALLABLE_LIBRARIES:
4133
+ available = ", ".join(INSTALLABLE_LIBRARIES.keys())
4134
+ typer.echo(f"Error: Unknown library '{library}'. Available: {available}", err=True)
4135
+ raise typer.Exit(1)
4136
+
4137
+ lib_info = INSTALLABLE_LIBRARIES[library]
4138
+
4139
+ try:
4140
+ target = load_target(name)
4141
+ except FileNotFoundError as e:
4142
+ typer.echo(f"Error: {e}", err=True)
4143
+ raise typer.Exit(1) from None
4144
+
4145
+ # Check if target is AMD (for AMD-only libraries)
4146
+ if lib_info.get("requires_amd"):
4147
+ from wafer_core.utils.kernel_utils.targets.config import (
4148
+ DigitalOceanTarget,
4149
+ RunPodTarget,
4150
+ )
4151
+
4152
+ is_amd = isinstance(target, (RunPodTarget, DigitalOceanTarget))
4153
+ if not is_amd and hasattr(target, "compute_capability"):
4154
+ # Check compute capability for MI300X (gfx942 = 9.4)
4155
+ is_amd = target.compute_capability.startswith("9.")
4156
+ if not is_amd:
4157
+ typer.echo(f"Error: {library} requires an AMD GPU target", err=True)
4158
+ raise typer.Exit(1)
4159
+
4160
+ typer.echo(f"Installing {library} on {name}...")
4161
+ typer.echo(f" {lib_info['description']}")
4162
+
4163
+ async def _install() -> bool:
4164
+ # get_target_ssh_info uses pure trio async (no asyncio bridging needed)
4165
+ # and we use subprocess for SSH, not AsyncSSHClient
4166
+ ssh_info = await get_target_ssh_info(target)
4167
+
4168
+ ssh_cmd = [
4169
+ "ssh",
4170
+ "-o",
4171
+ "StrictHostKeyChecking=no",
4172
+ "-o",
4173
+ "UserKnownHostsFile=/dev/null",
4174
+ "-o",
4175
+ "ConnectTimeout=30",
4176
+ "-i",
4177
+ str(ssh_info.key_path),
4178
+ "-p",
4179
+ str(ssh_info.port),
4180
+ f"{ssh_info.user}@{ssh_info.host}",
4181
+ ]
4182
+
4183
+ # Handle custom scripts (like repair-headers) vs git installs
4184
+ if "custom_script" in lib_info:
4185
+ install_script = str(lib_info["custom_script"])
4186
+ success_marker = "REPAIRED"
4187
+ else:
4188
+ install_path = lib_info["install_path"]
4189
+ git_url = lib_info["git_url"]
4190
+
4191
+ # Idempotent install script
4192
+ install_script = f"""
4193
+ if [ -d "{install_path}" ]; then
4194
+ echo "ALREADY_INSTALLED: {install_path} exists"
4195
+ cd {install_path} && git pull --quiet 2>/dev/null || true
4196
+ else
4197
+ echo "INSTALLING: cloning to {install_path}"
4198
+ git clone --quiet {git_url} {install_path}
4199
+ fi
4200
+ echo "DONE"
4201
+ """
4202
+ success_marker = "DONE"
4203
+
4204
+ def run_ssh() -> subprocess.CompletedProcess[str]:
4205
+ return subprocess.run(
4206
+ ssh_cmd + [install_script],
4207
+ capture_output=True,
4208
+ text=True,
4209
+ timeout=120,
4210
+ )
4211
+
4212
+ result = await trio.to_thread.run_sync(run_ssh)
4213
+
4214
+ if result.returncode != 0:
4215
+ typer.echo(f"Error: {result.stderr}", err=True)
4216
+ return False
4217
+
4218
+ output = result.stdout.strip()
4219
+ if "ALREADY_INSTALLED" in output:
4220
+ typer.echo(f" Already installed at {lib_info.get('install_path', 'N/A')}")
4221
+ elif "INSTALLING" in output:
4222
+ typer.echo(f" Installed to {lib_info.get('install_path', 'N/A')}")
4223
+ elif "REPAIRED" in output:
4224
+ typer.echo(" ROCm headers repaired")
4225
+
4226
+ return success_marker in output
4227
+
4228
+ try:
4229
+ success = trio.run(_install)
4230
+ except Exception as e:
4231
+ typer.echo(f"Error: {e}", err=True)
4232
+ raise typer.Exit(1) from None
4233
+
4234
+ if success:
4235
+ typer.echo(f"✓ {library} ready on {name}")
4236
+
4237
+ # Print usage hint
4238
+ if library == "hipkittens":
4239
+ typer.echo("")
4240
+ typer.echo("Usage in load_inline:")
4241
+ typer.echo(' extra_include_paths=["/opt/hipkittens/include", "/opt/rocm/include/hip"]')
4242
+ else:
4243
+ typer.echo(f"Failed to install {library}", err=True)
4244
+ raise typer.Exit(1)
4245
+
4246
+
4074
4247
  @targets_app.command("pods")
4075
4248
  def targets_pods() -> None:
4076
4249
  """List all running RunPod pods.
@@ -4406,9 +4579,13 @@ def workspaces_list(
4406
4579
  @workspaces_app.command("create")
4407
4580
  def workspaces_create(
4408
4581
  name: str = typer.Argument(..., help="Workspace name"),
4409
- gpu_type: str = typer.Option("B200", "--gpu", "-g", help="GPU type: MI300X (AMD) or B200 (NVIDIA, default)"),
4582
+ gpu_type: str = typer.Option(
4583
+ "B200", "--gpu", "-g", help="GPU type: MI300X (AMD) or B200 (NVIDIA, default)"
4584
+ ),
4410
4585
  image: str | None = typer.Option(None, "--image", "-i", help="Docker image (optional)"),
4411
- wait: bool = typer.Option(False, "--wait", "-w", help="Wait for provisioning and show SSH credentials"),
4586
+ wait: bool = typer.Option(
4587
+ False, "--wait", "-w", help="Wait for provisioning and show SSH credentials"
4588
+ ),
4412
4589
  json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
4413
4590
  ) -> None:
4414
4591
  """Create a new workspace.
@@ -4717,19 +4894,25 @@ def workspaces_ssh(
4717
4894
  ssh_host = ws.get("ssh_host")
4718
4895
  ssh_port = ws.get("ssh_port")
4719
4896
  ssh_user = ws.get("ssh_user")
4720
-
4897
+
4721
4898
  if not ssh_host or not ssh_port or not ssh_user:
4722
4899
  typer.echo("Error: Workspace not ready. Wait a few seconds and retry.", err=True)
4723
4900
  raise typer.Exit(1)
4724
4901
 
4725
4902
  # Connect via SSH
4726
- os.execvp("ssh", [
4903
+ os.execvp(
4727
4904
  "ssh",
4728
- "-p", str(ssh_port),
4729
- "-o", "StrictHostKeyChecking=no",
4730
- "-o", "UserKnownHostsFile=/dev/null",
4731
- f"{ssh_user}@{ssh_host}",
4732
- ])
4905
+ [
4906
+ "ssh",
4907
+ "-p",
4908
+ str(ssh_port),
4909
+ "-o",
4910
+ "StrictHostKeyChecking=no",
4911
+ "-o",
4912
+ "UserKnownHostsFile=/dev/null",
4913
+ f"{ssh_user}@{ssh_host}",
4914
+ ],
4915
+ )
4733
4916
 
4734
4917
 
4735
4918
  @workspaces_app.command("sync")
wafer/corpus.py CHANGED
@@ -3,10 +3,12 @@
3
3
  Download and manage documentation corpora for agent filesystem access.
4
4
  """
5
5
 
6
+ import re
6
7
  import shutil
7
8
  import tarfile
8
9
  import tempfile
9
10
  from dataclasses import dataclass
11
+ from html.parser import HTMLParser
10
12
  from pathlib import Path
11
13
  from typing import Literal
12
14
  from urllib.parse import urlparse
@@ -33,7 +35,7 @@ class CorpusConfig:
33
35
 
34
36
  name: CorpusName
35
37
  description: str
36
- source_type: Literal["nvidia_md", "github_repo", "github_multi_repo"]
38
+ source_type: Literal["nvidia_md", "github_repo", "github_multi_repo", "mixed"]
37
39
  urls: list[str] | None = None
38
40
  repo: str | None = None
39
41
  repo_paths: list[str] | None = None
@@ -67,10 +69,43 @@ CORPORA: dict[CorpusName, CorpusConfig] = {
67
69
  ),
68
70
  "cutlass": CorpusConfig(
69
71
  name="cutlass",
70
- description="CUTLASS and CuTe DSL documentation",
71
- source_type="github_repo",
72
- repo="NVIDIA/cutlass",
73
- repo_paths=["media/docs", "python/cutlass/docs"],
72
+ description="CUTLASS C++ documentation, examples, and tutorials",
73
+ source_type="mixed",
74
+ # Official NVIDIA CUTLASS documentation (scraped as markdown)
75
+ urls=[
76
+ "https://docs.nvidia.com/cutlass/latest/overview.html",
77
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/functionality.html",
78
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/terminology.html",
79
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/fundamental_types.html",
80
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/programming_guidelines.html",
81
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/heuristics.html",
82
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/efficient_gemm.html",
83
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/pipeline.html",
84
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/profiler.html",
85
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/dependent_kernel_launch.html",
86
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/blackwell_functionality.html",
87
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/blackwell_cluster_launch_control.html",
88
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/00_quickstart.html",
89
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/01_layout.html",
90
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/02_layout_algebra.html",
91
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/03_tensor.html",
92
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/04_algorithms.html",
93
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/0t_mma_atom.html",
94
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/0x_gemm_tutorial.html",
95
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/0y_predication.html",
96
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cute/0z_tma_tensors.html",
97
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cutlass_3x_design.html",
98
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/cutlass_3x_backwards_compatibility.html",
99
+ "https://docs.nvidia.com/cutlass/latest/media/docs/cpp/gemm_api_3x.html",
100
+ ],
101
+ # NVIDIA/cutlass GitHub examples (excluding python/)
102
+ repos=[
103
+ RepoSource(
104
+ repo="NVIDIA/cutlass",
105
+ paths=["examples"],
106
+ branch="main",
107
+ ),
108
+ ],
74
109
  ),
75
110
  "hip": CorpusConfig(
76
111
  name="hip",
@@ -169,19 +204,195 @@ def _url_to_filepath(url: str, base_dir: Path) -> Path:
169
204
  return base_dir / "/".join(path_parts)
170
205
 
171
206
 
207
+ class _HTMLToMarkdown(HTMLParser):
208
+ """HTML to Markdown converter for NVIDIA documentation pages.
209
+
210
+ Uses stdlib HTMLParser - requires subclassing due to callback-based API.
211
+ The public interface is the functional `_html_to_markdown()` below.
212
+ """
213
+
214
+ def __init__(self) -> None:
215
+ super().__init__()
216
+ self.output: list[str] = []
217
+ self.current_tag: str = ""
218
+ self.in_code_block = False
219
+ self.in_pre = False
220
+ self.list_depth = 0
221
+ self.ordered_list_counters: list[int] = []
222
+ self.skip_content = False
223
+ self.link_href: str | None = None
224
+
225
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
226
+ self.current_tag = tag
227
+ attrs_dict = dict(attrs)
228
+
229
+ # Skip script, style, nav, footer, header
230
+ if tag in ("script", "style", "nav", "footer", "header", "aside"):
231
+ self.skip_content = True
232
+ return
233
+
234
+ if tag == "h1":
235
+ self.output.append("\n# ")
236
+ elif tag == "h2":
237
+ self.output.append("\n## ")
238
+ elif tag == "h3":
239
+ self.output.append("\n### ")
240
+ elif tag == "h4":
241
+ self.output.append("\n#### ")
242
+ elif tag == "h5":
243
+ self.output.append("\n##### ")
244
+ elif tag == "h6":
245
+ self.output.append("\n###### ")
246
+ elif tag == "p":
247
+ self.output.append("\n\n")
248
+ elif tag == "br":
249
+ self.output.append("\n")
250
+ elif tag == "strong" or tag == "b":
251
+ self.output.append("**")
252
+ elif tag == "em" or tag == "i":
253
+ self.output.append("*")
254
+ elif tag == "code" and not self.in_pre:
255
+ self.output.append("`")
256
+ self.in_code_block = True
257
+ elif tag == "pre":
258
+ self.in_pre = True
259
+ # Check for language hint in class
260
+ lang = ""
261
+ if class_attr := attrs_dict.get("class"):
262
+ if "python" in class_attr.lower():
263
+ lang = "python"
264
+ elif "cpp" in class_attr.lower() or "c++" in class_attr.lower():
265
+ lang = "cpp"
266
+ elif "cuda" in class_attr.lower():
267
+ lang = "cuda"
268
+ self.output.append(f"\n```{lang}\n")
269
+ elif tag == "ul":
270
+ self.list_depth += 1
271
+ self.output.append("\n")
272
+ elif tag == "ol":
273
+ self.list_depth += 1
274
+ self.ordered_list_counters.append(1)
275
+ self.output.append("\n")
276
+ elif tag == "li":
277
+ indent = " " * (self.list_depth - 1)
278
+ if self.ordered_list_counters:
279
+ num = self.ordered_list_counters[-1]
280
+ self.output.append(f"{indent}{num}. ")
281
+ self.ordered_list_counters[-1] += 1
282
+ else:
283
+ self.output.append(f"{indent}- ")
284
+ elif tag == "a":
285
+ self.link_href = attrs_dict.get("href")
286
+ self.output.append("[")
287
+ elif tag == "img":
288
+ alt = attrs_dict.get("alt", "image")
289
+ src = attrs_dict.get("src", "")
290
+ self.output.append(f"![{alt}]({src})")
291
+ elif tag == "blockquote":
292
+ self.output.append("\n> ")
293
+ elif tag == "hr":
294
+ self.output.append("\n---\n")
295
+ elif tag == "table":
296
+ self.output.append("\n")
297
+ elif tag == "th":
298
+ self.output.append("| ")
299
+ elif tag == "td":
300
+ self.output.append("| ")
301
+ elif tag == "tr":
302
+ pass # Handled in endtag
303
+
304
+ def handle_endtag(self, tag: str) -> None:
305
+ if tag in ("script", "style", "nav", "footer", "header", "aside"):
306
+ self.skip_content = False
307
+ return
308
+
309
+ if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
310
+ self.output.append("\n")
311
+ elif tag == "strong" or tag == "b":
312
+ self.output.append("**")
313
+ elif tag == "em" or tag == "i":
314
+ self.output.append("*")
315
+ elif tag == "code" and not self.in_pre:
316
+ self.output.append("`")
317
+ self.in_code_block = False
318
+ elif tag == "pre":
319
+ self.in_pre = False
320
+ self.output.append("\n```\n")
321
+ elif tag == "ul":
322
+ self.list_depth = max(0, self.list_depth - 1)
323
+ elif tag == "ol":
324
+ self.list_depth = max(0, self.list_depth - 1)
325
+ if self.ordered_list_counters:
326
+ self.ordered_list_counters.pop()
327
+ elif tag == "li":
328
+ self.output.append("\n")
329
+ elif tag == "a":
330
+ if self.link_href:
331
+ self.output.append(f"]({self.link_href})")
332
+ else:
333
+ self.output.append("]")
334
+ self.link_href = None
335
+ elif tag == "p":
336
+ self.output.append("\n")
337
+ elif tag == "blockquote":
338
+ self.output.append("\n")
339
+ elif tag == "tr":
340
+ self.output.append("|\n")
341
+ elif tag == "thead":
342
+ # Add markdown table separator after header row
343
+ self.output.append("|---" * 10 + "|\n")
344
+
345
+ def handle_data(self, data: str) -> None:
346
+ if self.skip_content:
347
+ return
348
+ # Preserve whitespace in code blocks
349
+ if self.in_pre:
350
+ self.output.append(data)
351
+ else:
352
+ # Collapse whitespace outside code
353
+ text = re.sub(r"\s+", " ", data)
354
+ if text.strip():
355
+ self.output.append(text)
356
+
357
+ def get_markdown(self) -> str:
358
+ """Get the converted markdown, cleaned up."""
359
+ md = "".join(self.output)
360
+ # Clean up excessive newlines
361
+ md = re.sub(r"\n{3,}", "\n\n", md)
362
+ # Clean up empty table separators
363
+ md = re.sub(r"\|---\|---.*\|\n(?!\|)", "", md)
364
+ return md.strip()
365
+
366
+
367
+ def _html_to_markdown(html: str) -> str:
368
+ """Convert HTML to Markdown."""
369
+ parser = _HTMLToMarkdown()
370
+ parser.feed(html)
371
+ return parser.get_markdown()
372
+
373
+
172
374
  def _download_nvidia_md(config: CorpusConfig, dest: Path, verbose: bool = True) -> int:
173
- """Download NVIDIA docs using .md endpoint."""
375
+ """Download NVIDIA docs and convert HTML to Markdown.
376
+
377
+ NVIDIA's .md endpoint no longer works, so we scrape HTML and convert to markdown.
378
+ """
174
379
  assert config.urls is not None
175
380
  downloaded = 0
176
381
  with httpx.Client(timeout=30.0, follow_redirects=True) as client:
177
382
  for url in config.urls:
178
- md_url = f"{url}.md"
179
383
  filepath = _url_to_filepath(url, dest)
180
384
  filepath.parent.mkdir(parents=True, exist_ok=True)
181
385
  try:
182
- resp = client.get(md_url)
386
+ # Fetch HTML page directly
387
+ resp = client.get(url)
183
388
  resp.raise_for_status()
184
- filepath.write_text(resp.text)
389
+
390
+ # Convert HTML to Markdown
391
+ markdown = _html_to_markdown(resp.text)
392
+
393
+ # Add source URL as header
394
+ content = f"<!-- Source: {url} -->\n\n{markdown}"
395
+ filepath.write_text(content)
185
396
  downloaded += 1
186
397
  if verbose:
187
398
  print(f" ✓ {filepath.relative_to(dest)}")
@@ -275,6 +486,25 @@ def _download_github_multi_repo(config: CorpusConfig, dest: Path, verbose: bool
275
486
  return downloaded
276
487
 
277
488
 
489
+ def _download_mixed(config: CorpusConfig, dest: Path, verbose: bool = True) -> int:
490
+ """Download from mixed sources (NVIDIA docs + GitHub repos)."""
491
+ total = 0
492
+
493
+ # Download NVIDIA markdown docs (urls)
494
+ if config.urls:
495
+ if verbose:
496
+ print(" [NVIDIA docs]")
497
+ total += _download_nvidia_md(config, dest, verbose)
498
+
499
+ # Download GitHub repos
500
+ if config.repos:
501
+ if verbose:
502
+ print(" [GitHub repos]")
503
+ total += _download_github_multi_repo(config, dest, verbose)
504
+
505
+ return total
506
+
507
+
278
508
  def download_corpus(name: CorpusName, force: bool = False, verbose: bool = True) -> Path:
279
509
  """Download a corpus to local cache.
280
510
 
@@ -311,6 +541,8 @@ def download_corpus(name: CorpusName, force: bool = False, verbose: bool = True)
311
541
  count = _download_github_repo(config, dest, verbose)
312
542
  elif config.source_type == "github_multi_repo":
313
543
  count = _download_github_multi_repo(config, dest, verbose)
544
+ elif config.source_type == "mixed":
545
+ count = _download_mixed(config, dest, verbose)
314
546
  else:
315
547
  raise ValueError(f"Unknown source type: {config.source_type}")
316
548
  if verbose:
wafer/evaluate.py CHANGED
@@ -1168,11 +1168,16 @@ def _build_modal_sandbox_script(
1168
1168
  """
1169
1169
  gpu_type = target.gpu_type
1170
1170
 
1171
- # Determine PyTorch index based on GPU type
1171
+ # Determine PyTorch index and CUDA arch based on GPU type
1172
1172
  if gpu_type in ("B200", "GB200"):
1173
- torch_index = "https://download.pytorch.org/whl/nightly/cu128"
1173
+ torch_index = "https://download.pytorch.org/whl/cu130"
1174
+ cuda_arch_list = "10.0" # Blackwell (sm_100)
1175
+ elif gpu_type == "H100":
1176
+ torch_index = "https://download.pytorch.org/whl/cu130"
1177
+ cuda_arch_list = "9.0" # Hopper (sm_90)
1174
1178
  else:
1175
1179
  torch_index = "https://download.pytorch.org/whl/cu124"
1180
+ cuda_arch_list = "8.0" # Default to Ampere (sm_80)
1176
1181
 
1177
1182
  return f'''
1178
1183
  import asyncio
@@ -1190,7 +1195,7 @@ async def run_eval():
1190
1195
  "nvidia/cuda:12.9.0-devel-ubuntu22.04",
1191
1196
  add_python="3.12",
1192
1197
  )
1193
- .apt_install("git", "build-essential", "cmake")
1198
+ .apt_install("git", "build-essential", "cmake", "ripgrep")
1194
1199
  .pip_install(
1195
1200
  "torch",
1196
1201
  index_url="{torch_index}",
@@ -1203,6 +1208,12 @@ async def run_eval():
1203
1208
  )
1204
1209
  .env({{
1205
1210
  "CUDA_HOME": "/usr/local/cuda",
1211
+ # C++ compiler needs explicit include path for cuda_runtime.h
1212
+ "CPLUS_INCLUDE_PATH": "/usr/local/cuda/include",
1213
+ # Linker needs lib path
1214
+ "LIBRARY_PATH": "/usr/local/cuda/lib64",
1215
+ # Force PyTorch to compile for correct GPU architecture
1216
+ "TORCH_CUDA_ARCH_LIST": "{cuda_arch_list}",
1206
1217
  }})
1207
1218
  )
1208
1219
 
@@ -2790,6 +2801,15 @@ if torch.cuda.is_available():
2790
2801
  gc.collect()
2791
2802
  torch.cuda.empty_cache()
2792
2803
  torch.cuda.reset_peak_memory_stats()
2804
+
2805
+ # Enable TF32 for fair benchmarking against reference kernels.
2806
+ # PyTorch 1.12+ disables TF32 for matmul by default, which handicaps
2807
+ # reference kernels using cuBLAS. We enable it so reference kernels
2808
+ # run at their best performance (using tensor cores when applicable).
2809
+ # This ensures speedup comparisons are against optimized baselines.
2810
+ torch.backends.cuda.matmul.allow_tf32 = True
2811
+ torch.backends.cudnn.allow_tf32 = True
2812
+ print("[KernelBench] TF32 enabled for fair benchmarking")
2793
2813
 
2794
2814
 
2795
2815
  def _calculate_timing_stats(times: list[float]) -> dict:
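A small sketch of what the TF32 toggle above changes, assuming an Ampere-or-newer GPU: once enabled, float32 matmuls may run on tensor cores at TF32 precision, which is the fast path a tuned cuBLAS baseline would normally use:

import torch

a = torch.randn(4096, 4096, device="cuda")
b = torch.randn(4096, 4096, device="cuda")

torch.backends.cuda.matmul.allow_tf32 = False
exact = a @ b    # strict FP32 path

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
fast = a @ b     # may use TF32 tensor cores: faster, slightly lower precision

print((exact - fast).abs().max())  # small but typically nonzero on Ampere+ hardware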
@@ -3453,6 +3473,368 @@ def _validate_kernelbench_files(args: KernelBenchEvaluateArgs) -> str | None:
3453
3473
  return None
3454
3474
 
3455
3475
 
3476
+ def _build_modal_kernelbench_script(
3477
+ target: ModalTarget,
3478
+ impl_code_b64: str,
3479
+ ref_code_b64: str,
3480
+ eval_script_b64: str,
3481
+ run_benchmarks: bool,
3482
+ run_defensive: bool,
3483
+ defense_code_b64: str | None,
3484
+ seed: int,
3485
+ inputs_code_b64: str | None = None,
3486
+ ) -> str:
3487
+ """Build Python script to create Modal sandbox and run KernelBench evaluation.
3488
+
3489
+ This runs in a subprocess to isolate Modal's asyncio from trio.
3490
+ """
3491
+ gpu_type = target.gpu_type
3492
+
3493
+ # Determine PyTorch index and CUDA arch based on GPU type
3494
+ if gpu_type in ("B200", "GB200"):
3495
+ torch_index = "https://download.pytorch.org/whl/cu130"
3496
+ cuda_arch_list = "10.0" # Blackwell (sm_100)
3497
+ elif gpu_type == "H100":
3498
+ # H100 uses CUDA 13.0 (matches modal_app.py)
3499
+ torch_index = "https://download.pytorch.org/whl/cu130"
3500
+ cuda_arch_list = "9.0" # Hopper (sm_90)
3501
+ else:
3502
+ torch_index = "https://download.pytorch.org/whl/cu124"
3503
+ cuda_arch_list = "8.0" # Default to Ampere (sm_80)
3504
+
3505
+ # Install CUTLASS headers (for cute/tensor.hpp and cutlass/util/*.h) from GitHub
3506
+ # The nvidia-cutlass-dsl pip package doesn't include the C++ headers needed for nvcc
3507
+ # IMPORTANT: symlink to /usr/local/cuda/include because nvcc searches there by default
3508
+ cutlass_install = '''
3509
+ .run_commands([
3510
+ # Clone CUTLASS headers from GitHub (shallow clone, full include tree)
3511
+ # Use simple shallow clone - sparse-checkout can be buggy in some environments
3512
+ "git clone --depth 1 https://github.com/NVIDIA/cutlass.git /opt/cutlass",
3513
+ # Verify the util headers exist (for debugging)
3514
+ "ls -la /opt/cutlass/include/cutlass/util/ | head -5",
3515
+ # Symlink headers to CUDA include path (nvcc searches here by default)
3516
+ "ln -sf /opt/cutlass/include/cute /usr/local/cuda/include/cute",
3517
+ "ln -sf /opt/cutlass/include/cutlass /usr/local/cuda/include/cutlass",
3518
+ ])
3519
+ .pip_install(
3520
+ "nvidia-cutlass-dsl",
3521
+ index_url="https://pypi.nvidia.com",
3522
+ extra_index_url="https://pypi.org/simple",
3523
+ )
3524
+ '''
3525
+
3526
+ inputs_write = ""
3527
+ if inputs_code_b64:
3528
+ inputs_write = f'''
3529
+ # Write custom inputs
3530
+ proc = sandbox.exec("python", "-c", f"""
3531
+ import base64
3532
+ with open('/workspace/custom_inputs.py', 'w') as f:
3533
+ f.write(base64.b64decode('{inputs_code_b64}').decode())
3534
+ print('Custom inputs written')
3535
+ """)
3536
+ proc.wait()
3537
+ '''
3538
+
3539
+ defense_write = ""
3540
+ if run_defensive and defense_code_b64:
3541
+ defense_write = f'''
3542
+ # Write defense module
3543
+ proc = sandbox.exec("python", "-c", f"""
3544
+ import base64
3545
+ with open('/workspace/defense.py', 'w') as f:
3546
+ f.write(base64.b64decode('{defense_code_b64}').decode())
3547
+ print('Defense module written')
3548
+ """)
3549
+ proc.wait()
3550
+ '''
3551
+
3552
+ # Build eval command
3553
+ eval_cmd_parts = [
3554
+ "python /workspace/kernelbench_eval.py",
3555
+ "--impl /workspace/implementation.py",
3556
+ "--reference /workspace/reference.py",
3557
+ "--output /workspace/results.json",
3558
+ f"--seed {seed}",
3559
+ ]
3560
+ if run_benchmarks:
3561
+ eval_cmd_parts.append("--benchmark")
3562
+ if run_defensive and defense_code_b64:
3563
+ eval_cmd_parts.append("--defensive")
3564
+ eval_cmd_parts.append("--defense-module /workspace/defense.py")
3565
+ if inputs_code_b64:
3566
+ eval_cmd_parts.append("--inputs /workspace/custom_inputs.py")
3567
+
3568
+ eval_cmd = " ".join(eval_cmd_parts)
3569
+
3570
+ return f'''
3571
+ import asyncio
3572
+ import base64
3573
+ import json
3574
+ import sys
3575
+ import modal
3576
+
3577
+ async def run_eval():
3578
+ app = modal.App.lookup("wafer-evaluate", create_if_missing=True)
3579
+
3580
+ # Build image with PyTorch, CUTLASS DSL and dependencies
3581
+ image = (
3582
+ modal.Image.from_registry(
3583
+ "nvidia/cuda:12.9.0-devel-ubuntu22.04",
3584
+ add_python="3.12",
3585
+ )
3586
+ .apt_install("git", "build-essential", "cmake", "ninja-build", "ripgrep")
3587
+ .pip_install(
3588
+ "torch",
3589
+ index_url="{torch_index}",
3590
+ extra_index_url="https://pypi.org/simple",
3591
+ )
3592
+ .pip_install(
3593
+ "numpy",
3594
+ "triton",
3595
+ "ninja",
3596
+ )
3597
+ {cutlass_install}
3598
+ .env({{
3599
+ "CUDA_HOME": "/usr/local/cuda",
3600
+ # C++ compiler needs explicit include path for cuda_runtime.h
3601
+ "CPLUS_INCLUDE_PATH": "/usr/local/cuda/include",
3602
+ # Linker needs lib path
3603
+ "LIBRARY_PATH": "/usr/local/cuda/lib64",
3604
+ # Force PyTorch to compile for correct GPU architecture
3605
+ "TORCH_CUDA_ARCH_LIST": "{cuda_arch_list}",
3606
+ }})
3607
+ )
3608
+
3609
+ # Create sandbox
3610
+ sandbox = modal.Sandbox.create(
3611
+ app=app,
3612
+ image=image,
3613
+ gpu="{gpu_type}",
3614
+ timeout={target.timeout_seconds},
3615
+ )
3616
+
3617
+ try:
3618
+ # Create workspace directory
3619
+ sandbox.exec("mkdir", "-p", "/workspace").wait()
3620
+
3621
+ # Write files to sandbox
3622
+ proc = sandbox.exec("python", "-c", f"""
3623
+ import base64
3624
+ with open('/workspace/implementation.py', 'w') as f:
3625
+ f.write(base64.b64decode('{impl_code_b64}').decode())
3626
+ with open('/workspace/reference.py', 'w') as f:
3627
+ f.write(base64.b64decode('{ref_code_b64}').decode())
3628
+ with open('/workspace/kernelbench_eval.py', 'w') as f:
3629
+ f.write(base64.b64decode('{eval_script_b64}').decode())
3630
+ print('Files written')
3631
+ """)
3632
+ proc.wait()
3633
+ if proc.returncode != 0:
3634
+ print(json.dumps({{"success": False, "error": f"Failed to write files: {{proc.stderr.read()}}"}}))
3635
+ return
3636
+ {inputs_write}
3637
+ {defense_write}
3638
+ # Run evaluation
3639
+ print(f"Running KernelBench evaluation on {{'{gpu_type}'}}...")
3640
+ proc = sandbox.exec("bash", "-c", "{eval_cmd}")
3641
+
3642
+ # Stream output
3643
+ for line in proc.stdout:
3644
+ print(line, end="")
3645
+ for line in proc.stderr:
3646
+ print(line, end="", file=sys.stderr)
3647
+
3648
+ proc.wait()
3649
+
3650
+ if proc.returncode != 0:
3651
+ print(json.dumps({{"success": False, "error": f"Evaluation failed with exit code {{proc.returncode}}"}}))
3652
+ return
3653
+
3654
+ # Read results
3655
+ result_proc = sandbox.exec("cat", "/workspace/results.json")
3656
+ result_data = result_proc.stdout.read()
3657
+ result_proc.wait()
3658
+
3659
+ if result_data:
3660
+ results = json.loads(result_data)
3661
+ print("EVAL_RESULT_JSON:" + json.dumps(results))
3662
+ else:
3663
+ print(json.dumps({{"success": False, "error": "No results.json found"}}))
3664
+
3665
+ finally:
3666
+ sandbox.terminate()
3667
+
3668
+ asyncio.run(run_eval())
3669
+ '''
3670
+
3671
+
3672
+ async def run_evaluate_kernelbench_modal(
3673
+ args: KernelBenchEvaluateArgs,
3674
+ target: ModalTarget,
3675
+ ) -> EvaluateResult:
3676
+ """Run KernelBench format evaluation on Modal sandbox.
3677
+
3678
+ Creates a Modal sandbox, uploads files, runs KernelBench eval, and parses results.
3679
+ Uses subprocess to isolate Modal's asyncio from trio.
3680
+ """
3681
+ import base64
3682
+ import subprocess
3683
+ import sys
3684
+
3685
+ import trio
3686
+
3687
+ print(f"Creating Modal sandbox ({target.gpu_type}) for KernelBench evaluation...")
3688
+
3689
+ # Encode files as base64
3690
+ impl_code_b64 = base64.b64encode(args.implementation.read_bytes()).decode()
3691
+ ref_code_b64 = base64.b64encode(args.reference.read_bytes()).decode()
3692
+ eval_script_b64 = base64.b64encode(KERNELBENCH_EVAL_SCRIPT.encode()).decode()
3693
+
3694
+ # Encode custom inputs if provided
3695
+ inputs_code_b64 = None
3696
+ if args.inputs:
3697
+ inputs_code_b64 = base64.b64encode(args.inputs.read_bytes()).decode()
3698
+
3699
+ # Encode defense module if defensive mode is enabled
3700
+ defense_code_b64 = None
3701
+ if args.defensive:
3702
+ defense_path = (
3703
+ Path(__file__).parent.parent.parent.parent
3704
+ / "packages"
3705
+ / "wafer-core"
3706
+ / "wafer_core"
3707
+ / "utils"
3708
+ / "kernel_utils"
3709
+ / "defense.py"
3710
+ )
3711
+ if defense_path.exists():
3712
+ defense_code_b64 = base64.b64encode(defense_path.read_bytes()).decode()
3713
+ else:
3714
+ print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
3715
+
3716
+ # Build the script
3717
+ script = _build_modal_kernelbench_script(
3718
+ target=target,
3719
+ impl_code_b64=impl_code_b64,
3720
+ ref_code_b64=ref_code_b64,
3721
+ eval_script_b64=eval_script_b64,
3722
+ run_benchmarks=args.benchmark,
3723
+ run_defensive=args.defensive,
3724
+ defense_code_b64=defense_code_b64,
3725
+ seed=args.seed,
3726
+ inputs_code_b64=inputs_code_b64,
3727
+ )
3728
+
3729
+ def _run_subprocess() -> tuple[str, str, int]:
3730
+ result = subprocess.run(
3731
+ [sys.executable, "-c", script],
3732
+ capture_output=True,
3733
+ text=True,
3734
+ timeout=target.timeout_seconds + 120, # Extra buffer for sandbox creation + image build
3735
+ )
3736
+ return result.stdout, result.stderr, result.returncode
3737
+
3738
+ try:
3739
+ stdout, stderr, returncode = await trio.to_thread.run_sync(_run_subprocess)
3740
+ except subprocess.TimeoutExpired:
3741
+ return EvaluateResult(
3742
+ success=False,
3743
+ all_correct=False,
3744
+ correctness_score=0.0,
3745
+ geomean_speedup=0.0,
3746
+ passed_tests=0,
3747
+ total_tests=0,
3748
+ error_message=f"Modal KernelBench evaluation timed out after {target.timeout_seconds}s",
3749
+ )
3750
+ except Exception as e:
3751
+ return EvaluateResult(
3752
+ success=False,
3753
+ all_correct=False,
3754
+ correctness_score=0.0,
3755
+ geomean_speedup=0.0,
3756
+ passed_tests=0,
3757
+ total_tests=0,
3758
+ error_message=f"Failed to run Modal sandbox: {e}",
3759
+ )
3760
+
3761
+ # Print output for debugging
3762
+ if stdout:
3763
+ for line in stdout.split("\n"):
3764
+ if not line.startswith("EVAL_RESULT_JSON:"):
3765
+ print(line)
3766
+ if stderr:
3767
+ print(stderr, file=sys.stderr)
3768
+
3769
+ if returncode != 0:
3770
+ return EvaluateResult(
3771
+ success=False,
3772
+ all_correct=False,
3773
+ correctness_score=0.0,
3774
+ geomean_speedup=0.0,
3775
+ passed_tests=0,
3776
+ total_tests=0,
3777
+ error_message=f"Modal sandbox failed (exit {returncode}): {stderr or stdout}",
3778
+ )
3779
+
3780
+ # Parse results from stdout
3781
+ result_json = None
3782
+ for line in stdout.split("\n"):
3783
+ if line.startswith("EVAL_RESULT_JSON:"):
3784
+ result_json = line[len("EVAL_RESULT_JSON:"):]
3785
+ break
3786
+
3787
+ if not result_json:
3788
+ return EvaluateResult(
3789
+ success=False,
3790
+ all_correct=False,
3791
+ correctness_score=0.0,
3792
+ geomean_speedup=0.0,
3793
+ passed_tests=0,
3794
+ total_tests=0,
3795
+ error_message="No results found in Modal output",
3796
+ )
3797
+
3798
+ try:
3799
+ results = json.loads(result_json)
3800
+ except json.JSONDecodeError as e:
3801
+ return EvaluateResult(
3802
+ success=False,
3803
+ all_correct=False,
3804
+ correctness_score=0.0,
3805
+ geomean_speedup=0.0,
3806
+ passed_tests=0,
3807
+ total_tests=0,
3808
+ error_message=f"Failed to parse results JSON: {e}",
3809
+ )
3810
+
3811
+ # Check for error in results
3812
+ if "error" in results and results.get("success") is False:
3813
+ return EvaluateResult(
3814
+ success=False,
3815
+ all_correct=False,
3816
+ correctness_score=0.0,
3817
+ geomean_speedup=0.0,
3818
+ passed_tests=0,
3819
+ total_tests=0,
3820
+ error_message=results.get("error", "Unknown error"),
3821
+ )
3822
+
3823
+ # Extract metrics from results
3824
+ return EvaluateResult(
3825
+ success=True,
3826
+ all_correct=results.get("all_correct", False),
3827
+ correctness_score=float(results.get("correctness_score", 0.0)),
3828
+ geomean_speedup=float(results.get("geomean_speedup", 0.0)),
3829
+ passed_tests=int(results.get("passed_tests", 0)),
3830
+ total_tests=int(results.get("total_tests", 0)),
3831
+ error_message=results.get("error"),
3832
+ test_results=results.get("test_results", []),
3833
+ compilation_time_s=results.get("compilation_time_s"),
3834
+ profiling_stats=results.get("profiling_stats"),
3835
+ )
3836
+
3837
+
3456
3838
  async def run_evaluate_kernelbench_docker(
3457
3839
  args: KernelBenchEvaluateArgs,
3458
3840
  target: BaremetalTarget | VMTarget,
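The sandbox script above reports results by printing a single sentinel-prefixed JSON line into its log output; a minimal sketch of both sides of that protocol, using the marker from the code above:

import json

# Sandbox side: one machine-readable line among ordinary log output.
results = {"all_correct": True, "passed_tests": 5, "total_tests": 5}
print("EVAL_RESULT_JSON:" + json.dumps(results))

# CLI side: scan captured stdout for the marker and parse the payload.
def parse_results(stdout: str) -> dict | None:
    for line in stdout.splitlines():
        if line.startswith("EVAL_RESULT_JSON:"):
            return json.loads(line[len("EVAL_RESULT_JSON:"):])
    return None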
@@ -4246,6 +4628,20 @@ async def run_evaluate_kernelbench_runpod(
4246
4628
  )
4247
4629
 
4248
4630
 
4631
+ async def run_evaluate_kernelbench_baremetal_direct(
4632
+ args: KernelBenchEvaluateArgs,
4633
+ target: BaremetalTarget,
4634
+ ) -> EvaluateResult:
4635
+ """Run KernelBench format evaluation directly on NVIDIA target (no Docker).
4636
+
4637
+ For targets that already have PyTorch/CUDA installed (e.g., workspace containers).
4638
+ Uses CUDA_VISIBLE_DEVICES for GPU selection.
4639
+ """
4640
+ # Reuse the AMD function but with CUDA env vars
4641
+ # The logic is identical, just the GPU env var is different
4642
+ return await _run_evaluate_kernelbench_baremetal_direct_impl(args, target, gpu_env_var="CUDA_VISIBLE_DEVICES")
4643
+
4644
+
4249
4645
  async def run_evaluate_kernelbench_baremetal_amd(
4250
4646
  args: KernelBenchEvaluateArgs,
4251
4647
  target: BaremetalTarget,
@@ -4255,6 +4651,18 @@ async def run_evaluate_kernelbench_baremetal_amd(
4255
4651
  Runs evaluation script directly on host (no Docker) for AMD GPUs
4256
4652
  that have PyTorch/ROCm installed.
4257
4653
  """
4654
+ return await _run_evaluate_kernelbench_baremetal_direct_impl(args, target, gpu_env_var="HIP_VISIBLE_DEVICES")
4655
+
4656
+
4657
+ async def _run_evaluate_kernelbench_baremetal_direct_impl(
4658
+ args: KernelBenchEvaluateArgs,
4659
+ target: BaremetalTarget,
4660
+ gpu_env_var: str = "HIP_VISIBLE_DEVICES",
4661
+ ) -> EvaluateResult:
4662
+ """Internal implementation for direct baremetal evaluation.
4663
+
4664
+ Runs evaluation script directly on host (no Docker).
4665
+ """
4258
4666
  from datetime import datetime
4259
4667
 
4260
4668
  from wafer_core.async_ssh import AsyncSSHClient
@@ -4405,11 +4813,15 @@ async def run_evaluate_kernelbench_baremetal_amd(
4405
4813
 
4406
4814
  eval_cmd = " ".join(python_cmd_parts)
4407
4815
 
4408
- # Set environment for AMD GPU and run
4409
- # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
4410
- rocm_arch = _get_rocm_arch(target.compute_capability)
4411
- arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
4412
- env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
4816
+ # Set environment for GPU and run
4817
+ if gpu_env_var == "HIP_VISIBLE_DEVICES":
4818
+ # AMD: PYTORCH_ROCM_ARCH for faster compile
4819
+ rocm_arch = _get_rocm_arch(target.compute_capability)
4820
+ arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
4821
+ env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
4822
+ else:
4823
+ # NVIDIA: just set CUDA_VISIBLE_DEVICES
4824
+ env_vars = f"CUDA_VISIBLE_DEVICES={gpu_id} PYTHONUNBUFFERED=1"
4413
4825
  full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
4414
4826
 
4415
4827
  # Handle prepare-only mode
@@ -4560,10 +4972,16 @@ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateRes
4560
4972
  elif isinstance(target, RunPodTarget):
4561
4973
  # RunPod AMD MI300X - uses ROCm Docker with device passthrough
4562
4974
  return await run_evaluate_kernelbench_runpod(args, target)
4975
+ elif isinstance(target, ModalTarget):
4976
+ # Modal serverless - runs in Modal sandbox
4977
+ return await run_evaluate_kernelbench_modal(args, target)
4563
4978
  elif isinstance(target, BaremetalTarget | VMTarget):
4564
4979
  # Check if this is an AMD target (gfx* compute capability) - run directly
4565
4980
  if target.compute_capability and target.compute_capability.startswith("gfx"):
4566
4981
  return await run_evaluate_kernelbench_baremetal_amd(args, target)
4982
+ # Check for direct execution flag (workspace containers that already have everything)
4983
+ if getattr(target, "direct", False):
4984
+ return await run_evaluate_kernelbench_baremetal_direct(args, target)
4567
4985
  # NVIDIA targets - require docker_image to be set
4568
4986
  if not target.docker_image:
4569
4987
  return EvaluateResult(
wafer/templates/optimize_kernel.py CHANGED
@@ -68,4 +68,6 @@ IMPORTANT: Always verify correctness with wafer evaluate before claiming success
68
68
  "kernel": "./kernel.cu",
69
69
  "target": "H100",
70
70
  },
71
+ # Enable skill discovery (agent can load wafer-guide, etc.)
72
+ include_skills=True,
71
73
  )
wafer/wevin_cli.py CHANGED
@@ -274,7 +274,12 @@ def _build_environment(
274
274
  from wafer_core.sandbox import SandboxMode
275
275
 
276
276
  working_dir = Path(corpus_path) if corpus_path else Path.cwd()
277
- resolved_tools = tools_override or tpl.tools
277
+ resolved_tools = list(tools_override or tpl.tools)
278
+
279
+ # Add skill tool if skills are enabled
280
+ if tpl.include_skills and "skill" not in resolved_tools:
281
+ resolved_tools.append("skill")
282
+
278
283
  sandbox_mode = SandboxMode.DISABLED if no_sandbox else SandboxMode.ENABLED
279
284
  env: Environment = CodingEnvironment(
280
285
  working_dir=working_dir,
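A small note on the list() copy introduced above: without it, appending "skill" would mutate the template's shared tools list and leak into later invocations; a minimal illustration with hypothetical tool names:

template_tools = ["bash", "edit"]   # stands in for tpl.tools

resolved = list(template_tools)     # copy, as in the change above
resolved.append("skill")
print(template_tools)               # ['bash', 'edit']  (unchanged)

aliased = template_tools            # no copy
aliased.append("skill")
print(template_tools)               # ['bash', 'edit', 'skill']  (mutated)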
@@ -378,6 +383,7 @@ def main( # noqa: PLR0913, PLR0915
378
383
 
379
384
  # Handle --get-session: load session by ID and print
380
385
  if get_session:
386
+
381
387
  async def _get_session() -> None:
382
388
  try:
383
389
  session, err = await session_store.get(get_session)
@@ -398,16 +404,18 @@ def main( # noqa: PLR0913, PLR0915
398
404
  error_msg = f"Failed to serialize messages: {e}"
399
405
  print(json.dumps({"error": error_msg}))
400
406
  sys.exit(1)
401
-
402
- print(json.dumps({
403
- "session_id": session.session_id,
404
- "status": session.status.value,
405
- "model": session.endpoint.model if session.endpoint else None,
406
- "created_at": session.created_at,
407
- "updated_at": session.updated_at,
408
- "messages": messages_data,
409
- "tags": session.tags,
410
- }))
407
+
408
+ print(
409
+ json.dumps({
410
+ "session_id": session.session_id,
411
+ "status": session.status.value,
412
+ "model": session.endpoint.model if session.endpoint else None,
413
+ "created_at": session.created_at,
414
+ "updated_at": session.updated_at,
415
+ "messages": messages_data,
416
+ "tags": session.tags,
417
+ })
418
+ )
411
419
  else:
412
420
  print(f"Session: {session.session_id}")
413
421
  print(f"Status: {session.status.value}")
@@ -495,7 +503,7 @@ def main( # noqa: PLR0913, PLR0915
495
503
  print(f"Error loading template: {err}", file=sys.stderr)
496
504
  sys.exit(1)
497
505
  tpl = loaded_template
498
- system_prompt = tpl.interpolate_prompt(template_args or {})
506
+ base_system_prompt = tpl.interpolate_prompt(template_args or {})
499
507
  # Show template info when starting without a prompt
500
508
  if not prompt and tpl.description:
501
509
  print(f"Template: {tpl.name}", file=sys.stderr)
@@ -503,7 +511,20 @@ def main( # noqa: PLR0913, PLR0915
503
511
  print(file=sys.stderr)
504
512
  else:
505
513
  tpl = _get_default_template()
506
- system_prompt = tpl.system_prompt
514
+ base_system_prompt = tpl.system_prompt
515
+
516
+ # Append skill metadata if skills are enabled
517
+ if tpl.include_skills:
518
+ from wafer_core.rollouts.skills import discover_skills, format_skill_metadata_for_prompt
519
+
520
+ skill_metadata = discover_skills()
521
+ if skill_metadata:
522
+ skill_section = format_skill_metadata_for_prompt(skill_metadata)
523
+ system_prompt = base_system_prompt + "\n\n" + skill_section
524
+ else:
525
+ system_prompt = base_system_prompt
526
+ else:
527
+ system_prompt = base_system_prompt
507
528
 
508
529
  # CLI args override template values
509
530
  resolved_single_turn = single_turn if single_turn is not None else tpl.single_turn
@@ -550,7 +571,7 @@ def main( # noqa: PLR0913, PLR0915
550
571
  else:
551
572
  if json_output:
552
573
  # Emit session_start if we have a session_id (from --resume)
553
- model_name = endpoint.model if hasattr(endpoint, 'model') else None
574
+ model_name = endpoint.model if hasattr(endpoint, "model") else None
554
575
  frontend = StreamingChunkFrontend(session_id=session_id, model=model_name)
555
576
  else:
556
577
  frontend = NoneFrontend(show_tool_calls=True, show_thinking=False)
@@ -565,9 +586,11 @@ def main( # noqa: PLR0913, PLR0915
565
586
  # Emit session_start for new sessions (if session_id was None and we got one)
566
587
  # Check first state to emit as early as possible
567
588
  if json_output and isinstance(frontend, StreamingChunkFrontend):
568
- first_session_id = states[0].session_id if states and states[0].session_id else None
589
+ first_session_id = (
590
+ states[0].session_id if states and states[0].session_id else None
591
+ )
569
592
  if first_session_id and not session_id: # New session created
570
- model_name = endpoint.model if hasattr(endpoint, 'model') else None
593
+ model_name = endpoint.model if hasattr(endpoint, "model") else None
571
594
  frontend.emit_session_start(first_session_id, model_name)
572
595
  # Print resume command with full wafer agent prefix
573
596
  if states and states[-1].session_id:
wafer_cli-0.2.20.dist-info/METADATA → wafer_cli-0.2.22.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wafer-cli
3
- Version: 0.2.20
3
+ Version: 0.2.22
4
4
  Summary: CLI tool for running commands on remote GPUs and GPU kernel optimization agent
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: typer>=0.12.0
wafer_cli-0.2.20.dist-info/RECORD → wafer_cli-0.2.22.dist-info/RECORD CHANGED
@@ -5,10 +5,10 @@ wafer/api_client.py,sha256=i_Az2b2llC3DSW8yOL-BKqa7LSKuxOr8hSN40s-oQXY,6313
5
5
  wafer/auth.py,sha256=dwss_se5P-FFc9IN38q4kh_dBrA6k-CguDBkivgcdj0,14003
6
6
  wafer/autotuner.py,sha256=41WYP41pTDvMijv2h42vm89bcHtDMJXObDlWmn6xpFU,44416
7
7
  wafer/billing.py,sha256=jbLB2lI4_9f2KD8uEFDi_ixLlowe5hasC0TIZJyIXRg,7163
8
- wafer/cli.py,sha256=cNScdwOsyaSHnaRPtzSIcES6IEx4kWpMqMpZMIbrp3g,254768
8
+ wafer/cli.py,sha256=j4ODOVT_r-kyc21YOI8Yl8bkiZMGuqDpXRs7CvpNaek,261443
9
9
  wafer/config.py,sha256=h5Eo9_yfWqWGoPNdVQikI9GoZVUeysunSYiixf1mKcw,3411
10
- wafer/corpus.py,sha256=x5aFhCsTSAtgzFG9AMFpqq92Ej63mXofL-vvvpjj1sM,12913
11
- wafer/evaluate.py,sha256=s1NszUBtxdWRonbi8YR3XWfCiCjNm14g2Pp1lu4kmtY,176125
10
+ wafer/corpus.py,sha256=oQegXA43MuyRvYxOsWhmqeP5vMb5IKFHOvM-1RcahPA,22301
11
+ wafer/evaluate.py,sha256=SxxhiPkO6aDdfktRzJXpbWMVmIGn_gw-o5C6Zwj2zRc,190930
12
12
  wafer/global_config.py,sha256=fhaR_RU3ufMksDmOohH1OLeQ0JT0SDW1hEip_zaP75k,11345
13
13
  wafer/gpu_run.py,sha256=TwqXy72T7f2I7e6n5WWod3xgxCPnDhU0BgLsB4CUoQY,9716
14
14
  wafer/inference.py,sha256=tZCO5i05FKY27ewis3CSBHFBeFbXY3xwj0DSjdoMY9s,4314
@@ -26,16 +26,16 @@ wafer/target_lock.py,sha256=SDKhNzv2N7gsphGflcNni9FE5YYuAMuEthngAJEo4Gs,7809
26
26
  wafer/targets.py,sha256=9r-iRWoKSH5cQl1LcamaX-T7cNVOg99ngIm_hlRk-qU,26922
27
27
  wafer/targets_ops.py,sha256=jN1oIBx0mutxRNE9xpIc7SaBxPkVmOyus2eqn0kEKNI,21475
28
28
  wafer/tracelens.py,sha256=g9ZIeFyNojZn4uTd3skPqIrRiL7aMJOz_-GOd3aiyy4,7998
29
- wafer/wevin_cli.py,sha256=VnGVt__7kpVe2n_UctURSIpael_2TgsAwmqoQjz6CN0,22412
29
+ wafer/wevin_cli.py,sha256=Nuk7zTCiJrnpmYtdg5Hu0NbzONCqs54xtON6K7AVB9U,23189
30
30
  wafer/workspaces.py,sha256=iUdioK7kA3z_gOTMNVDn9Q87c6qpkdXF4bOhJWkUPg8,32375
31
31
  wafer/skills/wafer-guide/SKILL.md,sha256=KWetJw2TVTbz11_nzqazqOJWWRlbHRFShs4sOoreiWo,3255
32
32
  wafer/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  wafer/templates/ask_docs.py,sha256=Lxs-faz9v5m4Qa4NjF2X_lE8KwM9ES9MNJkxo7ep56o,2256
34
- wafer/templates/optimize_kernel.py,sha256=u6AL7Q3uttqlnBLzcoFdsiPq5lV2TV3bgqwCYYlK9gk,2357
34
+ wafer/templates/optimize_kernel.py,sha256=OvZgN5tm_OymO3lK8Dr0VO48e-5PfNVIIoACrPxpmqk,2446
35
35
  wafer/templates/optimize_kernelbench.py,sha256=aoOA13zWEl89r6QW03xF9NKxQ7j4mWe9rwua6-mlr4Y,4780
36
36
  wafer/templates/trace_analyze.py,sha256=XE1VqzVkIUsZbXF8EzQdDYgg-AZEYAOFpr6B_vnRELc,2880
37
- wafer_cli-0.2.20.dist-info/METADATA,sha256=rZ94ea_wCkSGAhT0X1wN9DFhCr5ojeXucvROQLX0Ox4,560
38
- wafer_cli-0.2.20.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
39
- wafer_cli-0.2.20.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
40
- wafer_cli-0.2.20.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
41
- wafer_cli-0.2.20.dist-info/RECORD,,
37
+ wafer_cli-0.2.22.dist-info/METADATA,sha256=vjYzyQtphWxQ0JID0k5tFWoLwVjlR6X0B4UAuMhLhQc,560
38
+ wafer_cli-0.2.22.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
39
+ wafer_cli-0.2.22.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
40
+ wafer_cli-0.2.22.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
41
+ wafer_cli-0.2.22.dist-info/RECORD,,