wafer-cli 0.2.32__py3-none-any.whl → 0.2.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
+ """Template for optimizing AMD aiter operators.
+
+ Usage:
+     wafer agent -t aiter-optimize --args op=gemm_a8w8 --args target=mi300x "Optimize this operator"
+     wafer agent -t aiter-optimize --args op=mha --args target=runpod-mi300x-rocm7 "Improve MHA performance"
+ """
+
+ try:
+     from wafer.agent_defaults import (
+         AITER_BASH_ALLOWLIST,
+         AITER_ENABLED_TOOLS,
+         AITER_SYSTEM_PROMPT,
+     )
+ except ImportError:
+     # Fallback for when wafer-cli package isn't installed
+     AITER_ENABLED_TOOLS = ["read", "write", "edit", "glob", "grep", "bash"]
+     AITER_BASH_ALLOWLIST = [
+         "ls", "cat", "head", "tail", "wc", "find", "grep", "rg", "pwd", "tree",
+         "which", "diff", "sort", "mkdir", "cp", "mv", "git diff", "git status",
+         "git log", "hipcc", "g++", "gcc", "clang", "python", "python3", "pip",
+         "pytest", "./", "wafer evaluate aiter", "wafer amd rocprof-compute",
+         "wafer amd rocprof-sdk", "wafer amd rocprof-systems", "wafer amd isa",
+         "wafer agent -t ask-docs", "timeout",
+     ]
+     AITER_SYSTEM_PROMPT = "You are a GPU kernel optimization expert for AMD MI300X and aiter."
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ # Convert the prompt's {op} and {target_flag} placeholders into $-style template
+ # variables ($op, --target $target); the template loader substitutes them at runtime
+ _SYSTEM_PROMPT = AITER_SYSTEM_PROMPT.replace("{op}", "$op").replace("{target_flag}", "--target $target")
+
+ template = TemplateConfig(
+     # Identity
+     name="aiter-optimize",
+     description="Optimize AMD aiter operators for better performance on MI300X",
+     # System prompt - uses shared prompt from agent_defaults
+     system_prompt=_SYSTEM_PROMPT,
+     # Tools - full coding environment
+     tools=AITER_ENABLED_TOOLS,
+     bash_allowlist=AITER_BASH_ALLOWLIST,
+     # Network access required for wafer evaluate (connects to remote GPU)
+     allow_network=True,
+     # Model config - use thinking for optimization analysis
+     model="anthropic/claude-sonnet-4-5-20250929",
+     max_tokens=16384,
+     thinking=True,
+     thinking_budget=10000,
+     # Multi-turn for iterative optimization
+     single_turn=False,
+     # Template variables
+     defaults={
+         "op": "gemm_a8w8",
+         "target": "mi300x",  # Default target; override with --args target=<name>
+     },
+ )
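
The new template relies on the loader turning `$op`/`$target` into concrete values at run time. Below is a minimal sketch of that flow, assuming a `string.Template`-style substitution; the real loader lives in `wafer_core.rollouts.templates`, and the prompt text used here is invented for illustration.

```python
# Illustration only: the real AITER_SYSTEM_PROMPT lives in wafer.agent_defaults and
# the real substitution happens inside wafer_core.rollouts.templates (not in this diff).
from string import Template

example_prompt = "Optimize the {op} operator. Evaluate with: wafer evaluate aiter {target_flag}"

# Same conversion as _SYSTEM_PROMPT above: {placeholders} -> $variables
converted = example_prompt.replace("{op}", "$op").replace("{target_flag}", "--target $target")

defaults = {"op": "gemm_a8w8", "target": "mi300x"}
user_args = {"op": "mha"}  # e.g. from --args op=mha on the command line

# Loader-style substitution: defaults first, user overrides last
rendered = Template(converted).safe_substitute({**defaults, **user_args})
print(rendered)
# Optimize the mha operator. Evaluate with: wafer evaluate aiter --target mi300x
```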
@@ -35,8 +35,7 @@ Strategy:
  Commands:
  - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests>` - Run evaluation
  - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests> --profile` - With NCU profiling
- - `wafer workspaces exec -- <command>` - Run arbitrary commands on remote GPU
- - `wafer targets exec <target> -- <command>` - Run commands on a configured target via SSH
+ - `wafer remote-run "<command>"` - Run arbitrary commands on remote GPU

  Output:
  - Summary of optimizations applied
@@ -49,8 +48,7 @@ IMPORTANT: Always verify correctness with wafer evaluate before claiming success
      tools=["read", "write", "edit", "glob", "grep", "bash"],
      bash_allowlist=[
          "wafer evaluate",
-         "wafer workspaces exec",
-         "wafer targets exec",
+         "wafer remote-run",
          "wafer nvidia ncu",
          "wafer nvidia nsys",
          "wafer nvidia perfetto",
@@ -1,4 +1,4 @@
- """Template for KernelBench optimization.
+ """Template for KernelBench optimization - matches eval system prompt.

  Usage:
      # Run on a specific problem
@@ -26,18 +26,12 @@ try:
  except ImportError:
      from rollouts.templates import TemplateConfig

- from wafer.agent_defaults import ENABLED_TOOLS, KERNELBENCH_BASH_ALLOWLIST
-
- # Task-specific instructions only — must stay in sync with the eval's SYSTEM_PROMPT
- # in research/evals/optimize_kernelbench_eval/.../base_config.py.
- # Run test_eval_cli_parity.py to verify.
- # Wafer CLI command docs are auto-generated from --help text and composed
- # at runtime by wevin_cli.py (see wafer.cli_instructions.build_cli_instructions).
- # TODO: Consider having both eval and template import SYSTEM_PROMPT from a shared
- # module so there's only one copy to maintain.
+ # System prompt matches optimize_kernelbench_eval/base_config.py SYSTEM_PROMPT
  SYSTEM_PROMPT = """\
  You are a GPU kernel optimization expert. Your task is to write optimized GPU kernels that are correct and faster than the PyTorch baseline.

+ IMPORTANT: You do NOT have a local GPU. You MUST use `wafer evaluate kernelbench` to test kernels on remote GPU hardware.
+
  ## Kernel Format (KernelBench)

  The reference file contains a PyTorch `Model` class. You must write a `ModelNew` class that:
@@ -49,14 +43,49 @@ The reference file also provides:
  - `get_inputs()` - generates test inputs for forward()
  - `get_init_inputs()` - generates constructor arguments

+ ## Available Tools
+
+ - read(file_path): Read source files
+ - write(file_path, content): Write your optimized kernel
+ - glob(pattern): Find files by pattern
+ - grep(pattern): Search code
+ - bash(command): Run shell commands including wafer CLI
+
  ## Workflow

  1. Read the reference problem file to understand what `Model` does
  2. Analyze the computation and identify optimization opportunities
  3. Write an optimized `ModelNew` class with custom $backend_upper kernels using `__global__` kernel definitions and `torch.utils.cpp_extension.load_inline`
- 4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl optimized.py --reference <problem.py> --benchmark`
+ 4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl <your_file.py> --reference <problem.py> --benchmark`
  5. Iterate based on feedback until correct and fast

+ ## Example Command
+
+ ```bash
+ wafer evaluate kernelbench \\
+     $target_flag \\
+     --backend $backend \\
+     --impl optimized_kernel.py \\
+     --reference $reference \\
+     --benchmark
+ ```
+
+ ## Profiling Tools (USE THESE!)
+
+ When your kernel is slower than expected, use profiling to understand WHY:
+
+ - `wafer rocprof profile --impl <file> --reference <ref>` - AMD GPU profiling
+ - `wafer nvidia ncu --impl <file> --reference <ref>` - NVIDIA NCU profiling
+
+ ## CRITICAL: Reactive Debugging
+
+ After EVERY `wafer evaluate` call:
+ 1. Check the speedup result
+ 2. If speedup < 1.0x (slowdown), STOP and analyze:
+    - Run profiling to identify the bottleneck
+    - Ask: "Why is this slow?" before trying another approach
+ 3. Don't just try random optimizations - understand the root cause
+
  Your kernel MUST:
  - Pass correctness tests (outputs match reference within tolerance)
  - Achieve speedup > 1.0x over PyTorch baseline
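
For reference, a minimal sketch of the `ModelNew` shape this prompt asks for, using `torch.utils.cpp_extension.load_inline` with a trivial fused add. It is not taken from the package and assumes a reference `Model` whose `forward()` simply adds two float tensors.

```python
# Hypothetical KernelBench-style answer file: not part of the wafer-cli package.
import torch
from torch.utils.cpp_extension import load_inline

cuda_src = r"""
#include <torch/extension.h>

__global__ void add_kernel(const float* a, const float* b, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = a[i] + b[i];
}

torch::Tensor fused_add(torch::Tensor a, torch::Tensor b) {
    auto out = torch::empty_like(a);
    int n = a.numel();
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    add_kernel<<<blocks, threads>>>(a.data_ptr<float>(), b.data_ptr<float>(),
                                    out.data_ptr<float>(), n);
    return out;
}
"""

cpp_src = "torch::Tensor fused_add(torch::Tensor a, torch::Tensor b);"

# Compile the inline extension once at import time
_ext = load_inline(name="fused_add_ext", cpp_sources=cpp_src,
                   cuda_sources=cuda_src, functions=["fused_add"])


class ModelNew(torch.nn.Module):
    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        return _ext.fused_add(a.contiguous(), b.contiguous())
```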
@@ -67,16 +96,32 @@ You MUST run `wafer evaluate kernelbench` to verify your kernel. Your score depe
  template = TemplateConfig(
      # Identity
      name="optimize-kernelbench",
-     description="Optimize KernelBench problems",
-     # System prompt (task-specific; CLI docs appended at runtime)
+     description="Optimize KernelBench problems (matches eval system prompt)",
+     # System prompt
      system_prompt=SYSTEM_PROMPT,
      # Tools
-     tools=ENABLED_TOOLS,
-     bash_allowlist=KERNELBENCH_BASH_ALLOWLIST,
-     # Model config
+     tools=["read", "write", "edit", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "wafer evaluate",
+         "wafer nvidia ncu",
+         "wafer nvidia nsys",
+         "wafer rocprof",
+         "wafer compiler-analyze",
+         "python",
+         "python3",
+         "timeout",
+         "ls",
+         "cat",
+         "head",
+         "tail",
+         "wc",
+         "pwd",
+         "which",
+     ],
+     # Model config - match eval settings
      model="anthropic/claude-opus-4-5-20251101",
      max_tokens=8192,
-     # No thinking by default, can override with --thinking
+     # No thinking by default (match eval), can override with --thinking
      thinking=False,
      # Multi-turn for iterative optimization
      single_turn=False,
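
The `bash_allowlist` entries above are command prefixes. A hypothetical sketch of how such prefix matching could be enforced; the actual enforcement lives in the agent runtime, not in this diff.

```python
# Hypothetical helper for illustration only.
def is_allowed(command: str, allowlist: list[str]) -> bool:
    """Allow a command if it exactly matches, or extends, an allowlisted prefix."""
    stripped = command.strip()
    return any(stripped == entry or stripped.startswith(entry + " ") for entry in allowlist)

assert is_allowed("wafer evaluate kernelbench --impl k.py", ["wafer evaluate", "python3"])
assert not is_allowed("rm -rf /", ["wafer evaluate", "python3"])
```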
@@ -0,0 +1,156 @@
+ """Template for vLLM kernel optimization.
+
+ Usage:
+     # Optimize fused_moe kernel
+     wafer agent -t optimize-vllm \
+         --args vllm_dir=/path/to/vllm \
+         --args op=fused_moe \
+         --args target=my-gpu-server \
+         "Optimize the fused MoE kernel for better throughput"
+
+     # With custom test and benchmark commands
+     wafer agent -t optimize-vllm \
+         --args vllm_dir=./vllm \
+         --args op=paged_attention \
+         --args test_cmd="pytest tests/kernels/attention/test_attention.py -v" \
+         --args bench_cmd="python benchmarks/kernels/benchmark_paged_attention.py" \
+         --json
+
+ Variables:
+     - vllm_dir: Path to vLLM repository (required)
+     - op: Target op to optimize (required, e.g., fused_moe, paged_attention)
+     - target: Target name (default: uses default target)
+     - pool: Target pool name (alternative to target)
+     - test_cmd: Pytest command for correctness (auto-generated from op if not provided)
+     - bench_cmd: Kernel microbenchmark command (auto-generated from op if not provided)
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ from wafer.agent_defaults import VLLM_BASH_ALLOWLIST, VLLM_ENABLED_TOOLS
+
+ # Default test commands per op (from vLLM's test structure)
+ DEFAULT_TEST_CMDS = {
+     "fused_moe": "pytest tests/kernels/moe/test_moe.py -v",
+     "paged_attention": "pytest tests/kernels/attention/test_attention.py -v",
+     "flash_attn": "pytest tests/kernels/attention/test_flash_attn.py -v",
+     "flashinfer": "pytest tests/kernels/attention/test_flashinfer.py -v",
+     "rms_norm": "pytest tests/kernels/core/test_layernorm.py -v -k rms",
+     "layernorm": "pytest tests/kernels/core/test_layernorm.py -v",
+     "rotary_embedding": "pytest tests/kernels/core/test_rotary_embedding.py -v",
+     "activation": "pytest tests/kernels/core/test_activation.py -v",
+     "fused_topk": "pytest tests/kernels/moe/test_fused_topk.py -v",
+     "fp8_quant": "pytest tests/kernels/quantization/test_fp8_quant.py -v",
+     "int8_quant": "pytest tests/kernels/quantization/test_int8_quant.py -v",
+ }
+
+ # Default benchmark commands per op.
+ # Uses pytest with --durations to measure kernel execution time.
+ # vLLM v0.15+ kernel benchmarks require config context, so pytest
+ # (which sets up fixtures) is the reliable path.
+ DEFAULT_BENCH_CMDS = {
+     "fused_moe": "pytest tests/kernels/moe/test_moe.py --timeout=300 --durations=0 -q",
+     "paged_attention": "pytest tests/kernels/attention/test_attention.py --timeout=300 --durations=0 -q",
+     "rms_norm": "pytest tests/kernels/core/test_layernorm.py -k rms --timeout=120 --durations=0 -q",
+     "layernorm": "pytest tests/kernels/core/test_layernorm.py --timeout=120 --durations=0 -q",
+     "rotary_embedding": "pytest tests/kernels/core/test_rotary_embedding.py --timeout=120 --durations=0 -q",
+     "activation": "pytest tests/kernels/core/test_activation.py --timeout=120 --durations=0 -q",
+     "fused_topk": "pytest tests/kernels/moe/test_fused_topk.py --timeout=120 --durations=0 -q",
+     "fp8_quant": "pytest tests/kernels/quantization/test_fp8_quant.py --timeout=120 --durations=0 -q",
+     "int8_quant": "pytest tests/kernels/quantization/test_int8_quant.py --timeout=120 --durations=0 -q",
+ }
+
+ SYSTEM_PROMPT = """\
+ You are a GPU kernel optimization expert. Your task is to improve the performance
+ of a specific vLLM kernel while maintaining correctness.
+
+ ## Target
+
+ You are optimizing the `$op` kernel in vLLM.
+ - vLLM directory: `$vllm_dir`
+ - Correctness test: `$test_cmd`
+ - Benchmark: `$bench_cmd`
+
+ ## Workflow
+
+ 1. **Understand the kernel**: Read the kernel implementation in `$vllm_dir`
+    - For MoE: `vllm/model_executor/layers/fused_moe/`
+    - For attention: `vllm/attention/backends/`
+    - For normalization: `vllm/_custom_ops.py` or specific layer files
+    - For quantization: `vllm/_custom_ops.py`
+
+ 2. **Run baseline benchmark**: Establish baseline performance
+    ```bash
+    cd $vllm_dir && $bench_cmd
+    ```
+
+ 3. **Analyze and optimize**: Identify optimization opportunities
+    - Memory access patterns (coalescing, shared memory usage)
+    - Occupancy and register pressure
+    - Algorithm improvements
+    - Hardware-specific optimizations (tensor cores, etc.)
+
+ 4. **Modify the kernel**: Make your changes to improve performance
+
+ 5. **Validate correctness**: Run the test suite
+    ```bash
+    cd $vllm_dir && $test_cmd
+    ```
+
+ 6. **Measure improvement**: Run benchmark again and compare
+
+ 7. **Iterate**: If correctness fails or performance regresses, adjust and retry
+
+ ## Evaluation
+
+ Use the wafer evaluate command to run both correctness and benchmark:
+ ```bash
+ wafer evaluate vllm --vllm-dir $vllm_dir --op $op \\
+     --test-cmd "$test_cmd" \\
+     --bench-cmd "$bench_cmd" \\
+     $target_flag --json
+ ```
+
+ ## Constraints
+
+ - The correctness test MUST pass after your changes
+ - Focus on the specific kernel identified (`$op`)
+ - Document your changes and reasoning
+ - Your score depends on actual measured throughput improvement
+
+ ## Key Metrics
+
+ - **time_us**: kernel execution time in microseconds (lower is better)
+ - **tflops**: teraflops achieved (higher is better)
+ - **bandwidth_gbps**: memory bandwidth in GB/s (higher is better)"""
+
+ template = TemplateConfig(
+     # Identity
+     name="optimize-vllm",
+     description="Optimize vLLM kernels for better inference performance",
+     # System prompt (task-specific; CLI docs appended at runtime)
+     system_prompt=SYSTEM_PROMPT,
+     # Tools
+     tools=VLLM_ENABLED_TOOLS,
+     bash_allowlist=VLLM_BASH_ALLOWLIST,
+     # Model config
+     model="anthropic/claude-opus-4-5-20251101",
+     max_tokens=8192,
+     # No thinking by default, can override with --thinking
+     thinking=False,
+     # Multi-turn for iterative optimization
+     single_turn=False,
+     # Template variables
+     defaults={
+         "vllm_dir": "./vllm",
+         "op": "fused_moe",
+         "target": "",
+         "pool": "",
+         "test_cmd": "",  # Auto-filled from DEFAULT_TEST_CMDS[op] if empty
+         "bench_cmd": "",  # Auto-filled from DEFAULT_BENCH_CMDS[op] if empty
+         "target_flag": "",  # Auto-computed: --target X or --pool Y
+     },
+ )
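
The empty `test_cmd`, `bench_cmd`, and `target_flag` defaults are documented as auto-filled. A hypothetical sketch of that resolution, reusing the `DEFAULT_TEST_CMDS`/`DEFAULT_BENCH_CMDS` tables defined in this file; the real logic lives elsewhere in wafer-cli and is not part of this diff.

```python
# Hypothetical illustration of the auto-fill described by the comments above.
def resolve_vllm_args(args: dict) -> dict:
    resolved = dict(args)
    op = resolved["op"]
    if not resolved.get("test_cmd"):
        resolved["test_cmd"] = DEFAULT_TEST_CMDS[op]
    if not resolved.get("bench_cmd"):
        # Fall back to the test command when no dedicated benchmark is listed (assumption)
        resolved["bench_cmd"] = DEFAULT_BENCH_CMDS.get(op, DEFAULT_TEST_CMDS[op])
    # target_flag: --target takes precedence, otherwise --pool, otherwise empty
    if resolved.get("target"):
        resolved["target_flag"] = f"--target {resolved['target']}"
    elif resolved.get("pool"):
        resolved["target_flag"] = f"--pool {resolved['pool']}"
    else:
        resolved["target_flag"] = ""
    return resolved

# Example: resolve_vllm_args({**template.defaults, "op": "rms_norm", "target": "my-gpu-server"})
```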
wafer/trace_compare.py CHANGED
@@ -6,22 +6,19 @@ All core logic is in wafer_core.lib.trace_compare.

  import sys
  from pathlib import Path
- from typing import Any

  import typer

- import json
- import sys
-
  from wafer_core.lib.trace_compare import (
-     analyze_trace_pair,
+     analyze_fusion_differences,
+     analyze_traces,
      format_csv,
+     format_fusion_csv,
+     format_fusion_json,
+     format_fusion_text,
      format_json,
      format_text,
-     ArchitectureType,
-     detect_architecture,
  )
- from wafer_core.lib.trace_compare.loader import StreamingMetadata


  def compare_traces(
@@ -33,7 +30,6 @@ def compare_traces(
      show_layers: bool = False,
      show_all: bool = False,
      show_stack_traces: bool = False,
-     recommendations: bool = False,
  ) -> None:
      """Compare two GPU traces and generate performance report.

@@ -56,60 +52,21 @@ def compare_traces(
          typer.secho(f"❌ File not found: {trace2}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)

-     # Progress callback for JSON format (emits NDJSON to stdout)
-     def progress_callback(stage: str, fraction: float) -> None:
-         if output_format == 'json':
-             progress_msg = json.dumps({"type": "progress", "stage": stage, "fraction": fraction})
-             print(progress_msg, file=sys.stdout, flush=True)
-         elif output_format != 'json':
-             percent = int(fraction * 100)
-             typer.echo(f"📊 {stage}: {percent}%", err=True)
-
-     # Metadata callback for JSON format (emits NDJSON with early GPU info)
-     def metadata_callback(meta1: StreamingMetadata, meta2: StreamingMetadata) -> None:
-         if output_format == 'json':
-             metadata_msg = json.dumps({
-                 "type": "metadata",
-                 "trace1": {
-                     "platform": meta1.platform,
-                     "gpu": meta1.gpu_name,
-                     "file_size_mb": round(meta1.file_size_mb, 1),
-                 },
-                 "trace2": {
-                     "platform": meta2.platform,
-                     "gpu": meta2.gpu_name,
-                     "file_size_mb": round(meta2.file_size_mb, 1),
-                 },
-             })
-             print(metadata_msg, file=sys.stdout, flush=True)
-         else:
-             typer.echo(f"📊 Trace 1: {meta1.platform} - {meta1.gpu_name} ({meta1.file_size_mb:.1f}MB)", err=True)
-             typer.echo(f"📊 Trace 2: {meta2.platform} - {meta2.gpu_name} ({meta2.file_size_mb:.1f}MB)", err=True)
-
-     # Analyze traces using unified API
+     # Analyze traces
+     # Only show progress messages for non-JSON formats (JSON needs clean stdout)
      if output_format != 'json':
          typer.echo("📊 Loading traces...")

+     # Determine how many stack traces to collect
+     max_stacks = 0 if (show_stack_traces and show_all) else (3 if show_stack_traces else 3)
+
      try:
-         result_obj = analyze_trace_pair(
+         results = analyze_traces(
              trace1,
              trace2,
-             phase=phase,
-             include_stacks=True,
-             on_progress=progress_callback,
-             on_metadata=metadata_callback,
+             phase_filter=phase,
+             max_stacks=max_stacks,
          )
-
-         results = {
-             "metadata": result_obj.metadata,
-             "operations": result_obj.operations,
-             "layers": result_obj.layers,
-             "warnings": [{"code": w.code, "severity": w.severity, "message": w.message, "suggestion": w.suggestion} for w in result_obj.warnings],
-             "architecture": result_obj.architecture.value,
-             "layer_alignments": result_obj.layer_alignments,
-             "fusion_analysis": result_obj.fusion_analysis,
-             "same_kernel_analysis": result_obj.same_kernel_analysis,
-         }
      except ValueError as e:
          typer.secho(f"❌ {e}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)
@@ -117,26 +74,17 @@ def compare_traces(
          typer.secho(f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)

+     # Show loading confirmation
      if output_format != 'json':
          meta = results["metadata"]
+         # Determine which trace is AMD and which is NVIDIA
          if meta['trace1_platform'] == 'AMD':
              amd_gpu, nvidia_gpu = meta['trace1_gpu'], meta['trace2_gpu']
          else:
              amd_gpu, nvidia_gpu = meta['trace2_gpu'], meta['trace1_gpu']
          typer.echo(f"✅ Loaded: AMD ({amd_gpu}) vs NVIDIA ({nvidia_gpu})")
-
-         # Display warnings
-         warnings = results.get("warnings", [])
-         if warnings:
-             typer.echo()
-             for warning in warnings:
-                 icon = "❌" if warning["severity"] == "error" else "⚠️" if warning["severity"] == "warning" else "ℹ️"
-                 typer.secho(f"{icon} {warning['message']}", fg=typer.colors.YELLOW if warning["severity"] == "warning" else typer.colors.BLUE)
-                 if warning.get("suggestion"):
-                     typer.secho(f" Suggestion: {warning['suggestion']}", fg=typer.colors.BLUE)
          typer.echo()

-
      # Generate output based on format
      if output_format == "text":
          output_str = format_text(results, show_layers=show_layers, show_all=show_all, show_stack_traces=show_stack_traces)
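
The refactored `compare_traces` now delegates everything to `analyze_traces`. Below is a usage sketch built only from the call shapes and metadata keys visible in this diff; the trace file names are placeholders.

```python
# Sketch, assuming the analyze_traces/format_json signatures shown in this diff.
from pathlib import Path

from wafer_core.lib.trace_compare import analyze_traces, format_json

results = analyze_traces(
    Path("amd_trace.json.gz"),      # placeholder path
    Path("nvidia_trace.json.gz"),   # placeholder path
    phase_filter="decode",          # 'all', 'prefill', or 'decode', per the docstrings in this file
    max_stacks=3,                   # same value the CLI computes above
)

meta = results["metadata"]
print(meta["trace1_platform"], meta["trace1_gpu"], "vs", meta["trace2_gpu"])
print(format_json(results))  # clean JSON on stdout, no progress noise
```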
@@ -160,23 +108,21 @@ def compare_traces(
      typer.echo(output_str)


- def compare_align(
+ def compare_fusion(
      trace1: Path,
      trace2: Path,
      output: Path | None = None,
-     output_format: str = "json",
-     phase: str = "all",
-     layer: int | None = None,
+     format_type: str = "text",
+     min_group_size: int = 50,
  ) -> None:
-     """Align kernels at layer level for exact kernel-to-kernel comparison.
+     """Analyze kernel fusion differences between AMD and NVIDIA traces.

      Args:
          trace1: Path to first trace file (AMD or NVIDIA)
          trace2: Path to second trace file (AMD or NVIDIA)
          output: Optional output file path (default: stdout)
-         output_format: Output format ('json' only for now)
-         phase: Filter by phase ('all', 'prefill', or 'decode')
-         layer: Focus on specific layer number (optional)
+         format_type: Output format ('text', 'csv', or 'json')
+         min_group_size: Minimum correlation group size to analyze
      """
      # Validate files exist
      if not trace1.exists():
@@ -187,86 +133,49 @@ def compare_align(
          typer.secho(f"❌ File not found: {trace2}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)

-     # Progress callback for JSON format (emits NDJSON to stdout)
-     def progress_callback(stage: str, fraction: float) -> None:
-         if output_format == 'json':
-             progress_msg = json.dumps({"type": "progress", "stage": stage, "fraction": fraction})
-             print(progress_msg, file=sys.stdout, flush=True)
-         else:
-             percent = int(fraction * 100)
-             typer.echo(f"📊 {stage}: {percent}%", err=True)
-
-     # Metadata callback for JSON format
-     def metadata_callback(meta1: StreamingMetadata, meta2: StreamingMetadata) -> None:
-         if output_format == 'json':
-             metadata_msg = json.dumps({
-                 "type": "metadata",
-                 "trace1": {
-                     "platform": meta1.platform,
-                     "gpu": meta1.gpu_name,
-                     "file_size_mb": round(meta1.file_size_mb, 1),
-                 },
-                 "trace2": {
-                     "platform": meta2.platform,
-                     "gpu": meta2.gpu_name,
-                     "file_size_mb": round(meta2.file_size_mb, 1),
-                 },
-             })
-             print(metadata_msg, file=sys.stdout, flush=True)
-         else:
-             typer.echo(f"📊 Trace 1: {meta1.platform} - {meta1.gpu_name} ({meta1.file_size_mb:.1f}MB)", err=True)
-             typer.echo(f"📊 Trace 2: {meta2.platform} - {meta2.gpu_name} ({meta2.file_size_mb:.1f}MB)", err=True)
-
-     # Analyze traces using unified API
-     if output_format != 'json':
+     # Analyze fusion
+     # Only show progress messages for non-JSON formats (JSON needs clean stdout)
+     if format_type != 'json':
          typer.echo("📊 Loading traces...")
-
      try:
-         result_obj = analyze_trace_pair(
+         results = analyze_fusion_differences(
              trace1,
              trace2,
-             phase=phase,
-             include_stacks=True,
-             on_progress=progress_callback,
-             on_metadata=metadata_callback,
+             min_group_size=min_group_size,
          )
-
-         results = {
-             "metadata": result_obj.metadata,
-             "layer_alignments": result_obj.layer_alignments or [],
-             "fusion_analysis": result_obj.fusion_analysis or {},
-             "same_kernel_analysis": result_obj.same_kernel_analysis or {},
-             "operations": result_obj.operations,
-             "layers": result_obj.layers,
-             "warnings": [{"code": w.code, "severity": w.severity, "message": w.message, "suggestion": w.suggestion} for w in result_obj.warnings],
-             "architecture": result_obj.architecture.value,
-         }
-
-         if layer is not None:
-             results["layer_alignments"] = [
-                 la for la in results["layer_alignments"] if la.get("layer") == layer
-             ]
-     except ValueError as e:
-         typer.secho(f"❌ {e}", fg=typer.colors.RED, err=True)
-         raise typer.Exit(1)
      except Exception as e:
-         typer.secho(f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True)
+         typer.secho(
+             f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True
+         )
          import traceback
+
          traceback.print_exc()
          raise typer.Exit(1)

-     if output_format != 'json':
+     # Show loading confirmation
+     if format_type != 'json':
          meta = results["metadata"]
-         typer.echo(f"✅ Loaded: {meta.get('amd_gpu', 'Unknown')} vs {meta.get('nvidia_gpu', 'Unknown')}")
-         typer.echo(f"✅ Found {len(results['layer_alignments'])} layers")
+         # Note: fusion analyzer always uses trace1=AMD, trace2=NVIDIA
+         typer.echo(f"✅ Loaded: {meta['trace1_gpu']} vs {meta['trace2_gpu']}")
+         typer.echo(
+             f"Found {meta['trace1_correlation_groups']} trace1 groups and "
+             f"{meta['trace2_correlation_groups']} trace2 groups with ≥{min_group_size} kernels"
+         )
+         typer.echo(f"✅ Matched {meta['matched_groups']} correlation groups")
          typer.echo()

-     if output_format == "json":
-         output_str = format_json(results)
+     # Generate output
+     if format_type == "text":
+         output_str = format_fusion_text(results)
+     elif format_type == "csv":
+         output_str = format_fusion_csv(results)
+     elif format_type == "json":
+         output_str = format_fusion_json(results)
      else:
-         typer.secho(f"❌ Format {output_format} not yet supported for align command. Use 'json'.", fg=typer.colors.RED, err=True)
+         typer.secho(f"❌ Unknown format: {format_type}", fg=typer.colors.RED, err=True)
          raise typer.Exit(1)

+     # Write output
      if output:
          output.write_text(output_str)
          typer.secho(f"✅ Report saved to {output}", fg=typer.colors.GREEN)
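
Similarly, the new `compare_fusion` path boils down to one analysis call plus a formatter. A sketch using only the functions and metadata keys shown above; the trace paths are placeholders.

```python
# Sketch, assuming the analyze_fusion_differences/format_fusion_text API shown in this diff.
from pathlib import Path

from wafer_core.lib.trace_compare import (
    analyze_fusion_differences,
    format_fusion_text,
)

results = analyze_fusion_differences(
    Path("amd_trace.json.gz"),      # placeholder path
    Path("nvidia_trace.json.gz"),   # placeholder path
    min_group_size=50,              # same default as compare_fusion()
)

meta = results["metadata"]
print(f"{meta['trace1_gpu']} vs {meta['trace2_gpu']}: "
      f"{meta['matched_groups']} matched correlation groups")
print(format_fusion_text(results))
```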
wafer/wevin_cli.py CHANGED
@@ -550,7 +550,7 @@ def main( # noqa: PLR0913, PLR0915
      api_base, api_key, api_key_refresh = _get_wafer_auth(no_proxy=no_proxy)
      if not api_base or not api_key:
          print("Error: No API credentials found", file=sys.stderr)
-         print(" Run 'wafer auth login' or set ANTHROPIC_API_KEY", file=sys.stderr)
+         print(" Run 'wafer login' or set ANTHROPIC_API_KEY", file=sys.stderr)
          sys.exit(1)

      assert api_base is not None
@@ -573,17 +573,6 @@ def main( # noqa: PLR0913, PLR0915
      tpl = _get_default_template()
      base_system_prompt = tpl.system_prompt

-     # Compose CLI instructions from --help text for allowed wafer commands
-     # TODO: The eval path doesn't have the skills layer below. If include_skills
-     # is ever enabled for optimize-kernelbench, the eval would need it too for parity.
-     # See test_eval_cli_parity.py for coverage notes.
-     if tpl.bash_allowlist:
-         from wafer.cli_instructions import build_cli_instructions
-
-         cli_instructions = build_cli_instructions(tpl.bash_allowlist)
-         if cli_instructions:
-             base_system_prompt = base_system_prompt + "\n\n" + cli_instructions
-

      # Append skill metadata if skills are enabled
      if tpl.include_skills: