wafer-cli 0.2.25__py3-none-any.whl → 0.2.26__py3-none-any.whl
- wafer/cli.py +63 -4
- wafer/corpus.py +65 -5
- wafer/trace_compare.py +139 -48
- {wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/METADATA +1 -1
- {wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/RECORD +8 -8
- {wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/top_level.txt +0 -0
wafer/cli.py
CHANGED
@@ -7787,6 +7787,9 @@ def compare_analyze(
     stack_traces: bool = typer.Option(
         False, "--stack-traces", help="Show Python stack traces for operations"
     ),
+    recommendations: bool = typer.Option(
+        False, "--recommendations", help="Generate prioritized recommendations for kernel team"
+    ),
     json: bool = typer.Option(
         False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"
     ),
@@ -7839,6 +7842,7 @@ def compare_analyze(
         show_layers=layers,
         show_all=all,
         show_stack_traces=stack_traces,
+        recommendations=recommendations,
     )
     _mark_command_success()
 
@@ -7883,14 +7887,69 @@ def compare_fusion_cmd(
         # CSV output to file
         wafer compare fusion amd_trace.json nvidia_trace.json --format csv -o fusion.csv
     """
-    from .trace_compare import
+    from .trace_compare import compare_align
+
+    compare_align(
+        trace1=trace1,
+        trace2=trace2,
+        output=output,
+        output_format=format,
+        phase="all",
+    )
+    _mark_command_success()
 
-
+
+@compare_app.command("align")
+def compare_align_cmd(
+    trace1: Path = typer.Argument(..., help="First trace file (AMD or NVIDIA)", exists=True),
+    trace2: Path = typer.Argument(..., help="Second trace file (AMD or NVIDIA)", exists=True),
+    format: str = typer.Option(
+        "json",
+        "--format",
+        "-f",
+        help="Output format: json",
+    ),
+    output: Path | None = typer.Option(
+        None, "--output", "-o", help="Output file (default: stdout)"
+    ),
+    phase: str = typer.Option(
+        "all",
+        "--phase",
+        help="Filter by phase: all, prefill, decode",
+    ),
+    layer: int | None = typer.Option(
+        None,
+        "--layer",
+        help="Focus on specific layer number",
+    ),
+) -> None:
+    """Align kernels at layer level for exact kernel-to-kernel comparison.
+
+    Provides kernel-to-kernel mapping across AMD and NVIDIA platforms,
+    showing which kernels correspond to each other at each layer position.
+
+    Examples:
+        # Basic alignment (stdout JSON)
+        wafer compare align amd_trace.json nvidia_trace.json
+
+        # Save to file
+        wafer compare align amd_trace.json nvidia_trace.json -o alignment.json
+
+        # Focus on decode phase only
+        wafer compare align amd_trace.json nvidia_trace.json --phase decode
+
+        # Focus on specific layer
+        wafer compare align amd_trace.json nvidia_trace.json --layer 5
+    """
+    from .trace_compare import compare_align
+
+    compare_align(
         trace1=trace1,
         trace2=trace2,
         output=output,
-
-
+        output_format=format,
+        phase=phase,
+        layer=layer,
     )
     _mark_command_success()
 
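The new `align` subcommand shown above simply forwards its CLI options to `compare_align`. The snippet below is an illustrative sketch only, not part of the package: it assumes the wheel is installed and two trace files exist on disk (file names are placeholders), and the keyword arguments mirror the `compare_align` signature visible in this diff.

```python
# Hypothetical usage sketch: drive the new alignment helper directly
# instead of going through the Typer CLI. Trace file names are placeholders.
from pathlib import Path

from wafer.trace_compare import compare_align

compare_align(
    trace1=Path("amd_trace.json"),
    trace2=Path("nvidia_trace.json"),
    output=Path("alignment.json"),  # None would print to stdout
    output_format="json",           # only 'json' is supported for now
    phase="decode",                 # 'all', 'prefill', or 'decode'
    layer=5,                        # None keeps all layers
)
```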
wafer/corpus.py
CHANGED
@@ -109,14 +109,34 @@ CORPORA: dict[CorpusName, CorpusConfig] = {
     ),
     "hip": CorpusConfig(
         name="hip",
-        description="HIP programming guide
-        source_type="
-
-
+        description="HIP programming guide, API reference, and examples",
+        source_type="github_multi_repo",
+        repos=[
+            # HIP - main documentation and API
+            RepoSource(
+                repo="ROCm/HIP",
+                paths=["docs"],
+            ),
+            # HIP examples - code samples
+            RepoSource(
+                repo="ROCm/HIP-Examples",
+                paths=["HIP-Examples-Applications", "mini-nbody"],
+            ),
+            # clr - HIP/OpenCL runtime (low-level)
+            RepoSource(
+                repo="ROCm/clr",
+                paths=["hipamd/include", "rocclr/device/gpu"],
+            ),
+            # ROCm docs - official documentation
+            RepoSource(
+                repo="ROCm/ROCm",
+                paths=["docs"],
+            ),
+        ],
     ),
     "amd": CorpusConfig(
         name="amd",
-        description="AMD GPU kernel development (rocWMMA, CK, AITER, rocBLAS, HipKittens, vLLM)",
+        description="AMD GPU kernel development (rocWMMA, CK, AITER, rocBLAS, HipKittens, vLLM, FlashAttention)",
         source_type="github_multi_repo",
         repos=[
             # rocWMMA - wave matrix multiply-accumulate (WMMA) intrinsics
@@ -186,6 +206,46 @@ CORPORA: dict[CorpusName, CorpusConfig] = {
                 repo="huggingface/hf-rocm-kernels",
                 paths=["csrc", "hf_rocm_kernels", "docs"],
             ),
+            # ROCm/flash-attention - FlashAttention for AMD GPUs
+            RepoSource(
+                repo="ROCm/flash-attention",
+                paths=["csrc", "docs"],
+            ),
+            # ROCm/triton - Triton compiler for AMD GPUs
+            RepoSource(
+                repo="ROCm/triton",
+                paths=["python/tutorials", "third_party/amd"],
+            ),
+            # ROCm/rccl - ROCm Communication Collectives Library (multi-GPU)
+            RepoSource(
+                repo="ROCm/rccl",
+                paths=["docs"],
+            ),
+            # ROCm/rocprofiler-sdk - AMD GPU profiling SDK
+            RepoSource(
+                repo="ROCm/rocprofiler-sdk",
+                paths=["docs", "samples"],
+            ),
+            # ROCm/omniperf - AMD GPU profiling tool
+            RepoSource(
+                repo="ROCm/omniperf",
+                paths=["docs", "src/omniperf_analyze"],
+            ),
+            # ROCm/omnitrace - Application tracing for AMD
+            RepoSource(
+                repo="ROCm/omnitrace",
+                paths=["docs"],
+            ),
+            # AMD GPUOpen Performance Guides
+            RepoSource(
+                repo="GPUOpen-Tools/gpu_performance_api",
+                paths=["docs"],
+            ),
+            # AMD LLVM - AMD GPU compiler backend
+            RepoSource(
+                repo="ROCm/llvm-project",
+                paths=["amd/device-libs/README.md", "llvm/docs/AMDGPUUsage.rst"],
+            ),
         ],
     ),
 }
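The corpus change above only extends the `CORPORA` registry with more GitHub sources. As a rough illustration of how that registry could be inspected, the sketch below uses only the fields visible in this diff (description, source_type, repos, repo, paths); the iteration itself is an assumption, not code from the package, and other corpus entries may not define `repos` at all.

```python
# Illustrative sketch (not part of the wheel): enumerate the GitHub sources
# behind each corpus, using only fields shown in this diff.
from wafer.corpus import CORPORA

for name, config in CORPORA.items():
    print(f"{name}: {config.description} [{config.source_type}]")
    for source in getattr(config, "repos", None) or []:
        print(f"  https://github.com/{source.repo} -> {', '.join(source.paths)}")
```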
wafer/trace_compare.py
CHANGED
@@ -6,19 +6,22 @@ All core logic is in wafer_core.lib.trace_compare.
 
 import sys
 from pathlib import Path
+from typing import Any
 
 import typer
 
+import json
+import sys
+
 from wafer_core.lib.trace_compare import (
-
-    analyze_traces,
+    analyze_trace_pair,
     format_csv,
-    format_fusion_csv,
-    format_fusion_json,
-    format_fusion_text,
     format_json,
     format_text,
+    ArchitectureType,
+    detect_architecture,
 )
+from wafer_core.lib.trace_compare.loader import StreamingMetadata
 
 
 def compare_traces(
@@ -30,6 +33,7 @@ def compare_traces(
     show_layers: bool = False,
     show_all: bool = False,
     show_stack_traces: bool = False,
+    recommendations: bool = False,
 ) -> None:
     """Compare two GPU traces and generate performance report.
 
@@ -52,21 +56,60 @@ def compare_traces(
         typer.secho(f"❌ File not found: {trace2}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1)
 
-    #
-
+    # Progress callback for JSON format (emits NDJSON to stdout)
+    def progress_callback(stage: str, fraction: float) -> None:
+        if output_format == 'json':
+            progress_msg = json.dumps({"type": "progress", "stage": stage, "fraction": fraction})
+            print(progress_msg, file=sys.stdout, flush=True)
+        elif output_format != 'json':
+            percent = int(fraction * 100)
+            typer.echo(f"📊 {stage}: {percent}%", err=True)
+
+    # Metadata callback for JSON format (emits NDJSON with early GPU info)
+    def metadata_callback(meta1: StreamingMetadata, meta2: StreamingMetadata) -> None:
+        if output_format == 'json':
+            metadata_msg = json.dumps({
+                "type": "metadata",
+                "trace1": {
+                    "platform": meta1.platform,
+                    "gpu": meta1.gpu_name,
+                    "file_size_mb": round(meta1.file_size_mb, 1),
+                },
+                "trace2": {
+                    "platform": meta2.platform,
+                    "gpu": meta2.gpu_name,
+                    "file_size_mb": round(meta2.file_size_mb, 1),
+                },
+            })
+            print(metadata_msg, file=sys.stdout, flush=True)
+        else:
+            typer.echo(f"📊 Trace 1: {meta1.platform} - {meta1.gpu_name} ({meta1.file_size_mb:.1f}MB)", err=True)
+            typer.echo(f"📊 Trace 2: {meta2.platform} - {meta2.gpu_name} ({meta2.file_size_mb:.1f}MB)", err=True)
+
+    # Analyze traces using unified API
     if output_format != 'json':
         typer.echo("📊 Loading traces...")
 
-    # Determine how many stack traces to collect
-    max_stacks = 0 if (show_stack_traces and show_all) else (3 if show_stack_traces else 3)
-
     try:
-
+        result_obj = analyze_trace_pair(
             trace1,
             trace2,
-
-
+            phase=phase,
+            include_stacks=True,
+            on_progress=progress_callback,
+            on_metadata=metadata_callback,
         )
+
+        results = {
+            "metadata": result_obj.metadata,
+            "operations": result_obj.operations,
+            "layers": result_obj.layers,
+            "warnings": [{"code": w.code, "severity": w.severity, "message": w.message, "suggestion": w.suggestion} for w in result_obj.warnings],
+            "architecture": result_obj.architecture.value,
+            "layer_alignments": result_obj.layer_alignments,
+            "fusion_analysis": result_obj.fusion_analysis,
+            "same_kernel_analysis": result_obj.same_kernel_analysis,
+        }
     except ValueError as e:
         typer.secho(f"❌ {e}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1)
@@ -74,17 +117,26 @@ def compare_traces(
         typer.secho(f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1)
 
-    # Show loading confirmation
     if output_format != 'json':
         meta = results["metadata"]
-        # Determine which trace is AMD and which is NVIDIA
         if meta['trace1_platform'] == 'AMD':
             amd_gpu, nvidia_gpu = meta['trace1_gpu'], meta['trace2_gpu']
         else:
             amd_gpu, nvidia_gpu = meta['trace2_gpu'], meta['trace1_gpu']
         typer.echo(f"✅ Loaded: AMD ({amd_gpu}) vs NVIDIA ({nvidia_gpu})")
+
+        # Display warnings
+        warnings = results.get("warnings", [])
+        if warnings:
+            typer.echo()
+            for warning in warnings:
+                icon = "❌" if warning["severity"] == "error" else "⚠️" if warning["severity"] == "warning" else "ℹ️"
+                typer.secho(f"{icon} {warning['message']}", fg=typer.colors.YELLOW if warning["severity"] == "warning" else typer.colors.BLUE)
+                if warning.get("suggestion"):
+                    typer.secho(f" Suggestion: {warning['suggestion']}", fg=typer.colors.BLUE)
         typer.echo()
 
+
     # Generate output based on format
     if output_format == "text":
         output_str = format_text(results, show_layers=show_layers, show_all=show_all, show_stack_traces=show_stack_traces)
@@ -108,21 +160,23 @@ def compare_traces(
         typer.echo(output_str)
 
 
-def
+def compare_align(
     trace1: Path,
     trace2: Path,
     output: Path | None = None,
-
-
+    output_format: str = "json",
+    phase: str = "all",
+    layer: int | None = None,
 ) -> None:
-    """
+    """Align kernels at layer level for exact kernel-to-kernel comparison.
 
     Args:
         trace1: Path to first trace file (AMD or NVIDIA)
         trace2: Path to second trace file (AMD or NVIDIA)
         output: Optional output file path (default: stdout)
-
-
+        output_format: Output format ('json' only for now)
+        phase: Filter by phase ('all', 'prefill', or 'decode')
+        layer: Focus on specific layer number (optional)
     """
     # Validate files exist
     if not trace1.exists():
@@ -133,49 +187,86 @@ def compare_fusion(
         typer.secho(f"❌ File not found: {trace2}", fg=typer.colors.RED, err=True)
         raise typer.Exit(1)
 
-    #
-
-
+    # Progress callback for JSON format (emits NDJSON to stdout)
+    def progress_callback(stage: str, fraction: float) -> None:
+        if output_format == 'json':
+            progress_msg = json.dumps({"type": "progress", "stage": stage, "fraction": fraction})
+            print(progress_msg, file=sys.stdout, flush=True)
+        else:
+            percent = int(fraction * 100)
+            typer.echo(f"📊 {stage}: {percent}%", err=True)
+
+    # Metadata callback for JSON format
+    def metadata_callback(meta1: StreamingMetadata, meta2: StreamingMetadata) -> None:
+        if output_format == 'json':
+            metadata_msg = json.dumps({
+                "type": "metadata",
+                "trace1": {
+                    "platform": meta1.platform,
+                    "gpu": meta1.gpu_name,
+                    "file_size_mb": round(meta1.file_size_mb, 1),
+                },
+                "trace2": {
+                    "platform": meta2.platform,
+                    "gpu": meta2.gpu_name,
+                    "file_size_mb": round(meta2.file_size_mb, 1),
+                },
+            })
+            print(metadata_msg, file=sys.stdout, flush=True)
+        else:
+            typer.echo(f"📊 Trace 1: {meta1.platform} - {meta1.gpu_name} ({meta1.file_size_mb:.1f}MB)", err=True)
+            typer.echo(f"📊 Trace 2: {meta2.platform} - {meta2.gpu_name} ({meta2.file_size_mb:.1f}MB)", err=True)
+
+    # Analyze traces using unified API
+    if output_format != 'json':
         typer.echo("📊 Loading traces...")
+
     try:
-
+        result_obj = analyze_trace_pair(
             trace1,
             trace2,
-
+            phase=phase,
+            include_stacks=True,
+            on_progress=progress_callback,
+            on_metadata=metadata_callback,
         )
+
+        results = {
+            "metadata": result_obj.metadata,
+            "layer_alignments": result_obj.layer_alignments or [],
+            "fusion_analysis": result_obj.fusion_analysis or {},
+            "same_kernel_analysis": result_obj.same_kernel_analysis or {},
+            "operations": result_obj.operations,
+            "layers": result_obj.layers,
+            "warnings": [{"code": w.code, "severity": w.severity, "message": w.message, "suggestion": w.suggestion} for w in result_obj.warnings],
+            "architecture": result_obj.architecture.value,
+        }
+
+        if layer is not None:
+            results["layer_alignments"] = [
+                la for la in results["layer_alignments"] if la.get("layer") == layer
+            ]
+    except ValueError as e:
+        typer.secho(f"❌ {e}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(1)
     except Exception as e:
-        typer.secho(
-            f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True
-        )
+        typer.secho(f"❌ Error analyzing traces: {e}", fg=typer.colors.RED, err=True)
         import traceback
-
         traceback.print_exc()
         raise typer.Exit(1)
 
-
-    if format_type != 'json':
+    if output_format != 'json':
         meta = results["metadata"]
-
-        typer.echo(f"✅
-        typer.echo(
-            f"Found {meta['trace1_correlation_groups']} trace1 groups and "
-            f"{meta['trace2_correlation_groups']} trace2 groups with ≥{min_group_size} kernels"
-        )
-        typer.echo(f"✅ Matched {meta['matched_groups']} correlation groups")
+        typer.echo(f"✅ Loaded: {meta.get('amd_gpu', 'Unknown')} vs {meta.get('nvidia_gpu', 'Unknown')}")
+        typer.echo(f"✅ Found {len(results['layer_alignments'])} layers")
         typer.echo()
 
-
-
-        output_str = format_fusion_text(results)
-    elif format_type == "csv":
-        output_str = format_fusion_csv(results)
-    elif format_type == "json":
-        output_str = format_fusion_json(results)
+    if output_format == "json":
+        output_str = format_json(results)
     else:
-        typer.secho(f"❌
+        typer.secho(f"❌ Format {output_format} not yet supported for align command. Use 'json'.", fg=typer.colors.RED, err=True)
         raise typer.Exit(1)
 
-    # Write output
     if output:
         output.write_text(output_str)
         typer.secho(f"✅ Report saved to {output}", fg=typer.colors.GREEN)
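In JSON mode, the code above interleaves one-line progress and metadata messages on stdout before the final report. A possible consumer is sketched below; it is not part of the package. It assumes the `wafer compare align` command and trace file names shown in the docstring examples, and it simply skips any output line that is not a single-line JSON object with a `type` field (such as the final report).

```python
# Hedged consumer sketch: read the NDJSON progress/metadata stream that the
# align command emits on stdout when the output format is JSON.
import json
import subprocess

proc = subprocess.Popen(
    ["wafer", "compare", "align", "amd_trace.json", "nvidia_trace.json"],
    stdout=subprocess.PIPE,
    text=True,
)
assert proc.stdout is not None
for line in proc.stdout:
    try:
        msg = json.loads(line)
    except json.JSONDecodeError:
        continue  # final report or other non-NDJSON output
    if isinstance(msg, dict) and msg.get("type") == "progress":
        print(f"{msg['stage']}: {msg['fraction']:.0%}")
    elif isinstance(msg, dict) and msg.get("type") == "metadata":
        t1, t2 = msg["trace1"], msg["trace2"]
        print(f"{t1['platform']} {t1['gpu']} vs {t2['platform']} {t2['gpu']}")
proc.wait()
```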
{wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/RECORD
CHANGED

@@ -6,10 +6,10 @@ wafer/api_client.py,sha256=i_Az2b2llC3DSW8yOL-BKqa7LSKuxOr8hSN40s-oQXY,6313
 wafer/auth.py,sha256=dwss_se5P-FFc9IN38q4kh_dBrA6k-CguDBkivgcdj0,14003
 wafer/autotuner.py,sha256=41WYP41pTDvMijv2h42vm89bcHtDMJXObDlWmn6xpFU,44416
 wafer/billing.py,sha256=hEEwtrtIsbPQ3lLJNcyTLMsapUbcuvcVW_e9_0SxzVo,7199
-wafer/cli.py,sha256=
+wafer/cli.py,sha256=s3m6SJzK1vRJxaQCrd_I4rcxrt3skty0GBdFHzIBc6U,279424
 wafer/cli_instructions.py,sha256=bziUKDNDAXABVMvKPLEMXm-hFSD2TcFSh-FKRYa949k,4693
 wafer/config.py,sha256=h5Eo9_yfWqWGoPNdVQikI9GoZVUeysunSYiixf1mKcw,3411
-wafer/corpus.py,sha256=
+wafer/corpus.py,sha256=CY9T7wXENNDJxnrtI-XsQmXeptrFfKG4x-lngrc9_3s,24748
 wafer/evaluate.py,sha256=HMFQD-uwC6Wky1t_0JxYZaoHWgLaTBkjxOxgpZVnGrc,190519
 wafer/global_config.py,sha256=fhaR_RU3ufMksDmOohH1OLeQ0JT0SDW1hEip_zaP75k,11345
 wafer/gpu_run.py,sha256=TwqXy72T7f2I7e6n5WWod3xgxCPnDhU0BgLsB4CUoQY,9716
@@ -27,7 +27,7 @@ wafer/ssh_keys.py,sha256=MxiHlSm6wuDUFzkOQtx5K7OIbx_a6bXxE-m8OpwLx98,8130
 wafer/target_lock.py,sha256=SDKhNzv2N7gsphGflcNni9FE5YYuAMuEthngAJEo4Gs,7809
 wafer/targets.py,sha256=9r-iRWoKSH5cQl1LcamaX-T7cNVOg99ngIm_hlRk-qU,26922
 wafer/targets_ops.py,sha256=jN1oIBx0mutxRNE9xpIc7SaBxPkVmOyus2eqn0kEKNI,21475
-wafer/trace_compare.py,sha256=
+wafer/trace_compare.py,sha256=COuxxKY874DteOSLUvJuJFREPMBSybq9dtANi3ATsg4,10803
 wafer/tracelens.py,sha256=g9ZIeFyNojZn4uTd3skPqIrRiL7aMJOz_-GOd3aiyy4,7998
 wafer/wevin_cli.py,sha256=eo1ETsXIsCftXSG5AxEYYZipNGcXayKyIevs5F6MjXg,26140
 wafer/workspaces.py,sha256=J-TXGwHXSZlzRWCew63KNvk6HLJ-zTSELRgzjryTkMk,35710
@@ -38,8 +38,8 @@ wafer/templates/optimize_kernel.py,sha256=4-MaKm_C9BQHQEllrNLLYkcdhJpcj6D-8zbJ4F
 wafer/templates/optimize_kernelbench.py,sha256=T3co9Y9eSLWDrZG66gwQVFMdnGVoyUQos-TxnMMBLL8,3747
 wafer/templates/trace_analyze.py,sha256=B7CiRlsokERzBjLL-k49kGjpU2zlJZqzTE05xbRS1WI,2878
 wafer/tests/test_eval_cli_parity.py,sha256=SGmaj2NGBZ7GdDF53bXsECvQbV21iHZw8YeL_MJOLk0,7206
-wafer_cli-0.2.
-wafer_cli-0.2.
-wafer_cli-0.2.
-wafer_cli-0.2.
-wafer_cli-0.2.
+wafer_cli-0.2.26.dist-info/METADATA,sha256=IM8Eatar1KYIBo1hHEBjvpX6J272f0PWfV4mwhV1jIY,2799
+wafer_cli-0.2.26.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+wafer_cli-0.2.26.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
+wafer_cli-0.2.26.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
+wafer_cli-0.2.26.dist-info/RECORD,,

{wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/WHEEL
File without changes

{wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/entry_points.txt
File without changes

{wafer_cli-0.2.25.dist-info → wafer_cli-0.2.26.dist-info}/top_level.txt
File without changes