PyPI - metal-debug - Versions diffs - 0.1.0__tar.gz - Mend

metal-debug 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

metal_debug-0.1.0/LICENSE +21 -0
metal_debug-0.1.0/PKG-INFO +229 -0
metal_debug-0.1.0/README.md +207 -0
metal_debug-0.1.0/pyproject.toml +35 -0
metal_debug-0.1.0/python/metal_debug.egg-info/PKG-INFO +229 -0
metal_debug-0.1.0/python/metal_debug.egg-info/SOURCES.txt +9 -0
metal_debug-0.1.0/python/metal_debug.egg-info/dependency_links.txt +1 -0
metal_debug-0.1.0/python/metal_debug.egg-info/requires.txt +7 -0
metal_debug-0.1.0/python/metal_debug.egg-info/top_level.txt +1 -0
metal_debug-0.1.0/python/metal_debug.py +377 -0
metal_debug-0.1.0/setup.cfg +4 -0

metal_debug-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 imperatormk
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

metal_debug-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,229 @@
+Metadata-Version: 2.4
+Name: metal-debug
+Version: 0.1.0
+Summary: Printf-style debugging for Metal compute shaders
+Author: imperatormk
+License-Expression: MIT
+Project-URL: Repository, https://github.com/imperatormk/metal-debug
+Keywords: metal,gpu,debug,apple,shader,compute
+Classifier: Development Status :: 3 - Alpha
+Classifier: Operating System :: MacOS
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Debuggers
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Provides-Extra: torch
+Requires-Dist: torch; extra == "torch"
+Provides-Extra: tui
+Requires-Dist: textual>=1.0; extra == "tui"
+Dynamic: license-file
+# metal-debug
+Printf-style debugging for Metal compute shaders. No Xcode GPU debugger, no buffer dumps, no guessing.
+Add `#include "metal_debug.h"` to your shader, drop in a debug buffer, see what every thread computed.
+```metal
+#include "metal_debug.h"
+kernel void my_kernel(
+    device float *A       [[buffer(0)]],
+    device float *B       [[buffer(1)]],
+    device float *C       [[buffer(2)]],
+    device uint  *dbg_buf [[buffer(30)]],
+    uint id [[thread_position_in_grid]]
+) {
+    float a = A[id], b = B[id];
+    dbg_printf(dbg_buf, id, 0, a);       // log input A
+    dbg_printf(dbg_buf, id, 1, b);       // log input B
+    float result = a * b;
+    dbg_watch_nan(dbg_buf, id, 2, result);  // only logs if NaN/Inf
+    dbg_assert(dbg_buf, id, 3, result > 0); // GPU-side assertion
+    C[id] = result;
+}
+```
+Host side (ObjC):
+```objc
+MetalDebugSession *dbg = [[MetalDebugSession alloc] initWithDevice:device maxEntries:4096];
+[encoder setBuffer:dbg.buffer offset:0 atIndex:30];
+// ... dispatch ...
+[dbg dump];
+```
+Output:
+```
+[metal-debug] 24 entries
+  thread[0] 0: 3.5
+  thread[0] 1: 2.0
+  thread[1] 0: 1.2
+  thread[1] 1: -0.5
+  thread[1] 3: ASSERTION FAILED
+```
+## Features
+| Feature | GPU API | Description |
+|---------|---------|-------------|
+| Printf | `dbg_printf(buf, tid, tag, val)` | Log float/int/uint/half/vec values |
+| Conditional | `dbg_printf_if(buf, cond, tid, tag, val)` | Only log when condition is true |
+| NaN watchpoint | `dbg_watch_nan(buf, tid, tag, val)` | Log only NaN/Inf values |
+| Range watchpoint | `dbg_watch_range(buf, tid, tag, val, lo, hi)` | Log values outside range |
+| Assertions | `dbg_assert(buf, tid, tag, cond)` | Record assertion failures |
+| Breakpoints | `dbg_break(buf, tid, tag, cond)` | Set flag for host to detect |
+| Stats | `dbg_stats(buf, tag, val)` | Cross-thread min/max/mean/count |
+| Histogram | `dbg_histogram(buf, tag, val, lo, hi)` | Value distribution with bar chart |
+| Named tags | Preprocessor or host-side | `"loss"` instead of `tag=42` |
+| 2D grid view | Host-side | Display values as threadgroup grid |
+| Diff mode | Host-side | Compare two kernel runs |
+| Zero-overhead disable | `#define METAL_DEBUG_DISABLE` | Compiles out all debug calls |
+## How it works
+1. **GPU side**: `metal_debug.h` is a single header. Debug calls write `(thread_id, tag, type, value)` entries into a device buffer using atomic counters.
+2. **Host side**: `MetalDebugSession` allocates the buffer, binds it at slot 30, and reads/formats entries after execution.
+3. **No recompilation needed** when changing buffer size — `max_entries` is stored in the buffer itself and read by the GPU at runtime.
+## Build & test
+```bash
+git clone <this repo>
+cd metal-debug
+make test    # compiles + runs 9 test kernels on your GPU
+```
+## Integration
+### ObjC / C++
+Copy `src/metal_debug.h` into your project. Link `runtime/MetalDebugSession.{h,m}` into your app.
+```objc
+#import "MetalDebugSession.h"
+MetalDebugSession *dbg = [[MetalDebugSession alloc]
+    initWithDevice:device maxEntries:4096];
+[dbg setName:@"loss" forTag:0];
+[encoder setBuffer:dbg.buffer offset:0 atIndex:30];
+// dispatch kernel...
+[dbg dump];                    // all entries, sorted by thread
+[dbg dumpTag:0];               // filter by tag
+[dbg dumpGrid:0 width:8 height:8]; // 2D threadgroup view
+[dbg dumpStats:0];             // min/max/mean
+[dbg dumpHistogram:0 lo:0 hi:1]; // value distribution
+if ([dbg breakpointHit])
+    [dbg dumpBreakpoint];      // what went wrong
+[dbg reset];                   // reuse for next dispatch
+```
+### Swift
+```swift
+import Metal
+let dbg = MetalDebugSession(device: device, maxEntries: 4096)
+encoder.setBuffer(dbg.buffer, offset: 0, index: 30)
+// dispatch kernel...
+dbg.dump()
+```
+See `examples/SwiftDemo/` for a complete Swift example.
+### Python (PyTorch MPS / Triton)
+```python
+from metal_debug import MetalDebugSession
+dbg = MetalDebugSession(max_entries=4096)
+# pass dbg.tensor as buffer(30) to your Metal/Triton kernel
+torch.mps.synchronize()
+dbg.dump()
+```
+### Interactive TUI
+Explore debug traces interactively — filter, navigate, see grid views and stats live:
+```bash
+pip install textual
+# Launch with demo data
+python python/tui.py --demo
+# Launch with a debug buffer dump
+python python/tui.py trace.bin
+```
+Or from Python after a kernel dispatch:
+```python
+dbg.explore(grid_width=8, grid_height=8)
+```
+Keyboard shortcuts:
+| Key | Action |
+|-----|--------|
+| `↑/↓` | Navigate entries |
+| `g` | Show 2D grid for selected tag |
+| `a` | Show assertions only |
+| `b` | Jump to breakpoint thread |
+| `c` | Clear filters |
+| `m` | Toggle mouse (enable copy/paste) |
+| `escape` | Focus table from filter input |
+| `q` | Quit |
+### Source preprocessor
+Auto-inject the debug buffer parameter into kernel signatures and use string tags:
+```bash
+python3 src/metal_debug_preprocess.py my_kernel.metal -o my_kernel_debug.metal
+xcrun metal -I path/to/metal-debug/src -o out.metallib my_kernel_debug.metal
+```
+Before:
+```metal
+kernel void foo(device float *A [[buffer(0)]], uint id [[thread_position_in_grid]]) {
+    dbg(id, "value", A[id]);
+}
+```
+After preprocessing:
+```metal
+kernel void foo(device float *A [[buffer(0)]], uint id [[thread_position_in_grid]],
+                device uint *_dbg_buf [[buffer(30)]]) {
+    dbg(id, 47248/*value*/, A[id]);
+}
+```
+## Convenience macros
+If you use `DBG_PARAM` in your kernel signature, the short macros work:
+```metal
+kernel void my_kernel(device float *A [[buffer(0)]], DBG_PARAM,
+                      uint id [[thread_position_in_grid]]) {
+    dbg(id, 0, A[id]);              // printf
+    dbg_if(id == 0, id, 1, A[id]);  // conditional
+    dbg_nan(id, 2, A[id]);          // NaN watchpoint
+    dbg_check(id, 3, A[id] > 0);   // assertion
+    dbg_stat(0, A[id]);             // stats accumulator
+    dbg_hist(0, A[id], 0, 100);     // histogram
+    dbg_brk(id, 4, A[id] < 0);     // breakpoint
+}
+```
+## License
+MIT

metal_debug-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,207 @@
+# metal-debug
+Printf-style debugging for Metal compute shaders. No Xcode GPU debugger, no buffer dumps, no guessing.
+Add `#include "metal_debug.h"` to your shader, drop in a debug buffer, see what every thread computed.
+```metal
+#include "metal_debug.h"
+kernel void my_kernel(
+    device float *A       [[buffer(0)]],
+    device float *B       [[buffer(1)]],
+    device float *C       [[buffer(2)]],
+    device uint  *dbg_buf [[buffer(30)]],
+    uint id [[thread_position_in_grid]]
+) {
+    float a = A[id], b = B[id];
+    dbg_printf(dbg_buf, id, 0, a);       // log input A
+    dbg_printf(dbg_buf, id, 1, b);       // log input B
+    float result = a * b;
+    dbg_watch_nan(dbg_buf, id, 2, result);  // only logs if NaN/Inf
+    dbg_assert(dbg_buf, id, 3, result > 0); // GPU-side assertion
+    C[id] = result;
+}
+```
+Host side (ObjC):
+```objc
+MetalDebugSession *dbg = [[MetalDebugSession alloc] initWithDevice:device maxEntries:4096];
+[encoder setBuffer:dbg.buffer offset:0 atIndex:30];
+// ... dispatch ...
+[dbg dump];
+```
+Output:
+```
+[metal-debug] 24 entries
+  thread[0] 0: 3.5
+  thread[0] 1: 2.0
+  thread[1] 0: 1.2
+  thread[1] 1: -0.5
+  thread[1] 3: ASSERTION FAILED
+```
+## Features
+| Feature | GPU API | Description |
+|---------|---------|-------------|
+| Printf | `dbg_printf(buf, tid, tag, val)` | Log float/int/uint/half/vec values |
+| Conditional | `dbg_printf_if(buf, cond, tid, tag, val)` | Only log when condition is true |
+| NaN watchpoint | `dbg_watch_nan(buf, tid, tag, val)` | Log only NaN/Inf values |
+| Range watchpoint | `dbg_watch_range(buf, tid, tag, val, lo, hi)` | Log values outside range |
+| Assertions | `dbg_assert(buf, tid, tag, cond)` | Record assertion failures |
+| Breakpoints | `dbg_break(buf, tid, tag, cond)` | Set flag for host to detect |
+| Stats | `dbg_stats(buf, tag, val)` | Cross-thread min/max/mean/count |
+| Histogram | `dbg_histogram(buf, tag, val, lo, hi)` | Value distribution with bar chart |
+| Named tags | Preprocessor or host-side | `"loss"` instead of `tag=42` |
+| 2D grid view | Host-side | Display values as threadgroup grid |
+| Diff mode | Host-side | Compare two kernel runs |
+| Zero-overhead disable | `#define METAL_DEBUG_DISABLE` | Compiles out all debug calls |
+## How it works
+1. **GPU side**: `metal_debug.h` is a single header. Debug calls write `(thread_id, tag, type, value)` entries into a device buffer using atomic counters.
+2. **Host side**: `MetalDebugSession` allocates the buffer, binds it at slot 30, and reads/formats entries after execution.
+3. **No recompilation needed** when changing buffer size — `max_entries` is stored in the buffer itself and read by the GPU at runtime.
+## Build & test
+```bash
+git clone <this repo>
+cd metal-debug
+make test    # compiles + runs 9 test kernels on your GPU
+```
+## Integration
+### ObjC / C++
+Copy `src/metal_debug.h` into your project. Link `runtime/MetalDebugSession.{h,m}` into your app.
+```objc
+#import "MetalDebugSession.h"
+MetalDebugSession *dbg = [[MetalDebugSession alloc]
+    initWithDevice:device maxEntries:4096];
+[dbg setName:@"loss" forTag:0];
+[encoder setBuffer:dbg.buffer offset:0 atIndex:30];
+// dispatch kernel...
+[dbg dump];                    // all entries, sorted by thread
+[dbg dumpTag:0];               // filter by tag
+[dbg dumpGrid:0 width:8 height:8]; // 2D threadgroup view
+[dbg dumpStats:0];             // min/max/mean
+[dbg dumpHistogram:0 lo:0 hi:1]; // value distribution
+if ([dbg breakpointHit])
+    [dbg dumpBreakpoint];      // what went wrong
+[dbg reset];                   // reuse for next dispatch
+```
+### Swift
+```swift
+import Metal
+let dbg = MetalDebugSession(device: device, maxEntries: 4096)
+encoder.setBuffer(dbg.buffer, offset: 0, index: 30)
+// dispatch kernel...
+dbg.dump()
+```
+See `examples/SwiftDemo/` for a complete Swift example.
+### Python (PyTorch MPS / Triton)
+```python
+from metal_debug import MetalDebugSession
+dbg = MetalDebugSession(max_entries=4096)
+# pass dbg.tensor as buffer(30) to your Metal/Triton kernel
+torch.mps.synchronize()
+dbg.dump()
+```
+### Interactive TUI
+Explore debug traces interactively — filter, navigate, see grid views and stats live:
+```bash
+pip install textual
+# Launch with demo data
+python python/tui.py --demo
+# Launch with a debug buffer dump
+python python/tui.py trace.bin
+```
+Or from Python after a kernel dispatch:
+```python
+dbg.explore(grid_width=8, grid_height=8)
+```
+Keyboard shortcuts:
+| Key | Action |
+|-----|--------|
+| `↑/↓` | Navigate entries |
+| `g` | Show 2D grid for selected tag |
+| `a` | Show assertions only |
+| `b` | Jump to breakpoint thread |
+| `c` | Clear filters |
+| `m` | Toggle mouse (enable copy/paste) |
+| `escape` | Focus table from filter input |
+| `q` | Quit |
+### Source preprocessor
+Auto-inject the debug buffer parameter into kernel signatures and use string tags:
+```bash
+python3 src/metal_debug_preprocess.py my_kernel.metal -o my_kernel_debug.metal
+xcrun metal -I path/to/metal-debug/src -o out.metallib my_kernel_debug.metal
+```
+Before:
+```metal
+kernel void foo(device float *A [[buffer(0)]], uint id [[thread_position_in_grid]]) {
+    dbg(id, "value", A[id]);
+}
+```
+After preprocessing:
+```metal
+kernel void foo(device float *A [[buffer(0)]], uint id [[thread_position_in_grid]],
+                device uint *_dbg_buf [[buffer(30)]]) {
+    dbg(id, 47248/*value*/, A[id]);
+}
+```
+## Convenience macros
+If you use `DBG_PARAM` in your kernel signature, the short macros work:
+```metal
+kernel void my_kernel(device float *A [[buffer(0)]], DBG_PARAM,
+                      uint id [[thread_position_in_grid]]) {
+    dbg(id, 0, A[id]);              // printf
+    dbg_if(id == 0, id, 1, A[id]);  // conditional
+    dbg_nan(id, 2, A[id]);          // NaN watchpoint
+    dbg_check(id, 3, A[id] > 0);   // assertion
+    dbg_stat(0, A[id]);             // stats accumulator
+    dbg_hist(0, A[id], 0, 100);     // histogram
+    dbg_brk(id, 4, A[id] < 0);     // breakpoint
+}
+```
+## License
+MIT

metal_debug-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,35 @@
+[build-system]
+requires = ["setuptools>=68.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "metal-debug"
+version = "0.1.0"
+description = "Printf-style debugging for Metal compute shaders"
+readme = "README.md"
+license = "MIT"
+requires-python = ">=3.9"
+authors = [{ name = "imperatormk" }]
+keywords = ["metal", "gpu", "debug", "apple", "shader", "compute"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Operating System :: MacOS",
+    "Programming Language :: Python :: 3",
+    "Topic :: Software Development :: Debuggers",
+]
+dependencies = [
+    "numpy",
+]
+[project.optional-dependencies]
+torch = ["torch"]
+tui = ["textual>=1.0"]
+[project.urls]
+Repository = "https://github.com/imperatormk/metal-debug"
+[tool.setuptools]
+py-modules = ["metal_debug"]
+[tool.setuptools.package-dir]
+"" = "python"

metal_debug-0.1.0/python/metal_debug.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,229 @@
+Metadata-Version: 2.4
+Name: metal-debug
+Version: 0.1.0
+Summary: Printf-style debugging for Metal compute shaders
+Author: imperatormk
+License-Expression: MIT
+Project-URL: Repository, https://github.com/imperatormk/metal-debug
+Keywords: metal,gpu,debug,apple,shader,compute
+Classifier: Development Status :: 3 - Alpha
+Classifier: Operating System :: MacOS
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Debuggers
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Provides-Extra: torch
+Requires-Dist: torch; extra == "torch"
+Provides-Extra: tui
+Requires-Dist: textual>=1.0; extra == "tui"
+Dynamic: license-file
+# metal-debug
+Printf-style debugging for Metal compute shaders. No Xcode GPU debugger, no buffer dumps, no guessing.
+Add `#include "metal_debug.h"` to your shader, drop in a debug buffer, see what every thread computed.
+```metal
+#include "metal_debug.h"
+kernel void my_kernel(
+    device float *A       [[buffer(0)]],
+    device float *B       [[buffer(1)]],
+    device float *C       [[buffer(2)]],
+    device uint  *dbg_buf [[buffer(30)]],
+    uint id [[thread_position_in_grid]]
+) {
+    float a = A[id], b = B[id];
+    dbg_printf(dbg_buf, id, 0, a);       // log input A
+    dbg_printf(dbg_buf, id, 1, b);       // log input B
+    float result = a * b;
+    dbg_watch_nan(dbg_buf, id, 2, result);  // only logs if NaN/Inf
+    dbg_assert(dbg_buf, id, 3, result > 0); // GPU-side assertion
+    C[id] = result;
+}
+```
+Host side (ObjC):
+```objc
+MetalDebugSession *dbg = [[MetalDebugSession alloc] initWithDevice:device maxEntries:4096];
+[encoder setBuffer:dbg.buffer offset:0 atIndex:30];
+// ... dispatch ...
+[dbg dump];
+```
+Output:
+```
+[metal-debug] 24 entries
+  thread[0] 0: 3.5
+  thread[0] 1: 2.0
+  thread[1] 0: 1.2
+  thread[1] 1: -0.5
+  thread[1] 3: ASSERTION FAILED
+```
+## Features
+| Feature | GPU API | Description |
+|---------|---------|-------------|
+| Printf | `dbg_printf(buf, tid, tag, val)` | Log float/int/uint/half/vec values |
+| Conditional | `dbg_printf_if(buf, cond, tid, tag, val)` | Only log when condition is true |
+| NaN watchpoint | `dbg_watch_nan(buf, tid, tag, val)` | Log only NaN/Inf values |
+| Range watchpoint | `dbg_watch_range(buf, tid, tag, val, lo, hi)` | Log values outside range |
+| Assertions | `dbg_assert(buf, tid, tag, cond)` | Record assertion failures |
+| Breakpoints | `dbg_break(buf, tid, tag, cond)` | Set flag for host to detect |
+| Stats | `dbg_stats(buf, tag, val)` | Cross-thread min/max/mean/count |
+| Histogram | `dbg_histogram(buf, tag, val, lo, hi)` | Value distribution with bar chart |
+| Named tags | Preprocessor or host-side | `"loss"` instead of `tag=42` |
+| 2D grid view | Host-side | Display values as threadgroup grid |
+| Diff mode | Host-side | Compare two kernel runs |
+| Zero-overhead disable | `#define METAL_DEBUG_DISABLE` | Compiles out all debug calls |
+## How it works
+1. **GPU side**: `metal_debug.h` is a single header. Debug calls write `(thread_id, tag, type, value)` entries into a device buffer using atomic counters.
+2. **Host side**: `MetalDebugSession` allocates the buffer, binds it at slot 30, and reads/formats entries after execution.
+3. **No recompilation needed** when changing buffer size — `max_entries` is stored in the buffer itself and read by the GPU at runtime.
+## Build & test
+```bash
+git clone <this repo>
+cd metal-debug
+make test    # compiles + runs 9 test kernels on your GPU
+```
+## Integration
+### ObjC / C++
+Copy `src/metal_debug.h` into your project. Link `runtime/MetalDebugSession.{h,m}` into your app.
+```objc
+#import "MetalDebugSession.h"
+MetalDebugSession *dbg = [[MetalDebugSession alloc]
+    initWithDevice:device maxEntries:4096];
+[dbg setName:@"loss" forTag:0];
+[encoder setBuffer:dbg.buffer offset:0 atIndex:30];
+// dispatch kernel...
+[dbg dump];                    // all entries, sorted by thread
+[dbg dumpTag:0];               // filter by tag
+[dbg dumpGrid:0 width:8 height:8]; // 2D threadgroup view
+[dbg dumpStats:0];             // min/max/mean
+[dbg dumpHistogram:0 lo:0 hi:1]; // value distribution
+if ([dbg breakpointHit])
+    [dbg dumpBreakpoint];      // what went wrong
+[dbg reset];                   // reuse for next dispatch
+```
+### Swift
+```swift
+import Metal
+let dbg = MetalDebugSession(device: device, maxEntries: 4096)
+encoder.setBuffer(dbg.buffer, offset: 0, index: 30)
+// dispatch kernel...
+dbg.dump()
+```
+See `examples/SwiftDemo/` for a complete Swift example.
+### Python (PyTorch MPS / Triton)
+```python
+from metal_debug import MetalDebugSession
+dbg = MetalDebugSession(max_entries=4096)
+# pass dbg.tensor as buffer(30) to your Metal/Triton kernel
+torch.mps.synchronize()
+dbg.dump()
+```
+### Interactive TUI
+Explore debug traces interactively — filter, navigate, see grid views and stats live:
+```bash
+pip install textual
+# Launch with demo data
+python python/tui.py --demo
+# Launch with a debug buffer dump
+python python/tui.py trace.bin
+```
+Or from Python after a kernel dispatch:
+```python
+dbg.explore(grid_width=8, grid_height=8)
+```
+Keyboard shortcuts:
+| Key | Action |
+|-----|--------|
+| `↑/↓` | Navigate entries |
+| `g` | Show 2D grid for selected tag |
+| `a` | Show assertions only |
+| `b` | Jump to breakpoint thread |
+| `c` | Clear filters |
+| `m` | Toggle mouse (enable copy/paste) |
+| `escape` | Focus table from filter input |
+| `q` | Quit |
+### Source preprocessor
+Auto-inject the debug buffer parameter into kernel signatures and use string tags:
+```bash
+python3 src/metal_debug_preprocess.py my_kernel.metal -o my_kernel_debug.metal
+xcrun metal -I path/to/metal-debug/src -o out.metallib my_kernel_debug.metal
+```
+Before:
+```metal
+kernel void foo(device float *A [[buffer(0)]], uint id [[thread_position_in_grid]]) {
+    dbg(id, "value", A[id]);
+}
+```
+After preprocessing:
+```metal
+kernel void foo(device float *A [[buffer(0)]], uint id [[thread_position_in_grid]],
+                device uint *_dbg_buf [[buffer(30)]]) {
+    dbg(id, 47248/*value*/, A[id]);
+}
+```
+## Convenience macros
+If you use `DBG_PARAM` in your kernel signature, the short macros work:
+```metal
+kernel void my_kernel(device float *A [[buffer(0)]], DBG_PARAM,
+                      uint id [[thread_position_in_grid]]) {
+    dbg(id, 0, A[id]);              // printf
+    dbg_if(id == 0, id, 1, A[id]);  // conditional
+    dbg_nan(id, 2, A[id]);          // NaN watchpoint
+    dbg_check(id, 3, A[id] > 0);   // assertion
+    dbg_stat(0, A[id]);             // stats accumulator
+    dbg_hist(0, A[id], 0, 100);     // histogram
+    dbg_brk(id, 4, A[id] < 0);     // breakpoint
+}
+```
+## License
+MIT

metal_debug-0.1.0/python/metal_debug.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,9 @@
+LICENSE
+README.md
+pyproject.toml
+python/metal_debug.py
+python/metal_debug.egg-info/PKG-INFO
+python/metal_debug.egg-info/SOURCES.txt
+python/metal_debug.egg-info/dependency_links.txt
+python/metal_debug.egg-info/requires.txt
+python/metal_debug.egg-info/top_level.txt

metal_debug-0.1.0/python/metal_debug.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

metal_debug-0.1.0/python/metal_debug.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,7 @@
+numpy
+[torch]
+torch
+[tui]
+textual>=1.0

metal_debug-0.1.0/python/metal_debug.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ metal_debug

metal_debug-0.1.0/python/metal_debug.py ADDED Viewed

@@ -0,0 +1,377 @@
+"""
+metal_debug — Python/PyTorch wrapper for Metal compute shader debugging.
+Usage:
+    from metal_debug import MetalDebugSession
+    dbg = MetalDebugSession(max_entries=1024)
+    # Your kernel gets the debug buffer as an extra argument at buffer(30)
+    # Pass dbg.tensor as that argument
+    my_kernel(A, B, C, ..., dbg.tensor, threads=..., group_size=...)
+    torch.mps.synchronize()
+    dbg.dump()                    # print all entries
+    dbg.dump(tag=2)               # filter by tag
+    dbg.dump(thread=0)            # filter by thread
+    dbg.stats(tag=0)              # min/max/mean/count
+    dbg.histogram(tag=0, lo=0, hi=100)
+    dbg.grid(tag=50, width=4, height=4)
+    dbg.diff(other_session)       # compare two runs
+    if dbg.breakpoint_hit:
+        dbg.dump_breakpoint()
+Compile your .metal with:
+    xcrun metal -DMETAL_DEBUG_MAX_ENTRIES=1024 -I path/to/metal-debug/src ...
+"""
+import torch
+import struct
+import json
+from pathlib import Path
+from typing import Optional
+# Must match GPU-side constants
+# Number of bins read back per histogram tag.
+METAL_DEBUG_HIST_BINS = 32
+# Number of per-tag stats slots; each slot occupies four uint32 words.
+METAL_DEBUG_STATS_TAGS = 256
+# Type codes stored in the third word of every log entry. They select how the
+# fourth word (the raw 32-bit value) is decoded by _decode_value().
+TYPE_FLOAT = 0
+TYPE_INT = 1
+TYPE_UINT = 2
+TYPE_HALF = 3
+# Also used for breakpoint markers, which carry the value bits 0xDEAD.
+TYPE_ASSERT_FAIL = 4
+def _decode_value(type_id: int, bits: int) -> str:
+    """Format the raw 32-bit payload of a debug entry for display.
+    Args:
+        type_id: One of the TYPE_* codes recorded with the entry.
+        bits: The entry's value word as an unsigned 32-bit integer.
+    Returns:
+        Display text for the value. Unknown type codes fall back to the
+        payload rendered as eight hexadecimal digits.
+    """
+    if type_id == TYPE_UINT:
+        return str(bits)
+    if type_id == TYPE_INT:
+        # Reinterpret the unsigned word as a signed 32-bit integer
+        (signed,) = struct.unpack('i', struct.pack('I', bits))
+        return str(signed)
+    if type_id == TYPE_FLOAT:
+        (single,) = struct.unpack('f', struct.pack('I', bits))
+        return f"{single:.6g}"
+    if type_id == TYPE_HALF:
+        # Only the low 16 bits are used: 1 sign, 5 exponent, 10 mantissa bits
+        half = bits & 0xFFFF
+        exponent = (half >> 10) & 0x1F
+        mantissa = half & 0x3FF
+        if exponent == 31:
+            magnitude = float('nan') if mantissa else float('inf')
+        elif exponent == 0:
+            # Zero / subnormal: there is no implicit leading one
+            magnitude = mantissa * 2**-24
+        else:
+            # Normal: implicit leading one (1024) with the bias folded into the power
+            magnitude = (mantissa + 1024) * 2**(exponent - 25)
+        if (half >> 15) & 1:
+            magnitude = -magnitude
+        return f"{magnitude:.4g} (half)"
+    if type_id == TYPE_ASSERT_FAIL:
+        # Two payloads are fixed markers; anything else is shown as a float
+        marker = {0: "ASSERTION FAILED", 0xDEAD: "BREAKPOINT"}.get(bits)
+        if marker is not None:
+            return marker
+        (payload,) = struct.unpack('f', struct.pack('I', bits))
+        return f"ASSERTION FAILED (val={payload:.6g})"
+    return f"0x{bits:08x}"
+def _decode_float(bits: int) -> float:
+    """Reinterpret an unsigned 32-bit pattern as a single-precision float."""
+    raw = struct.pack('I', bits)
+    (value,) = struct.unpack('f', raw)
+    return value
+class MetalDebugSession:
+    """Host-side debug session for Metal compute shaders."""
+    def __init__(self, max_entries: int = 1024, device: str = "mps"):
+        """Allocate and initialise the debug buffer.
+        Args:
+            max_entries: Capacity of the log-entry region, in entries.
+            device: Torch device string on which the buffer tensor is created.
+        The buffer is laid out in 32-bit words: a two-word header
+        ([0]=counter, [1]=max_entries), then four words per log entry, four
+        words per stats tag, METAL_DEBUG_HIST_BINS words for each of 16
+        histogram tags, and finally three breakpoint words.
+        """
+        self.max_entries = max_entries
+        self._device = device
+        self._tag_names: dict[int, str] = {}
+        # Region sizes, in uint32 words
+        header_words = 2
+        words_per_entry = 4
+        words_per_stat = 4
+        histogram_tags = 16
+        breakpoint_words = 3
+        # Each region starts where the previous one ends
+        self._entries_base = header_words
+        self._stats_base = self._entries_base + max_entries * words_per_entry
+        self._hist_base = self._stats_base + METAL_DEBUG_STATS_TAGS * words_per_stat
+        self._break_base = self._hist_base + histogram_tags * METAL_DEBUG_HIST_BINS
+        # Stored as int32; readers reinterpret the words as uint32
+        self.tensor = torch.zeros(self._break_base + breakpoint_words,
+                                  dtype=torch.int32, device=device)
+        self._init_buffer()
+    def _init_buffer(self):
+        """Seed the buffer header and the per-tag stats slots.
+        Stores max_entries in word 1 and primes words 1 and 2 of every stats
+        slot with the bit patterns of +3.4028235e+38 and -3.4028235e+38 (the
+        words stats() later decodes as min and max). Rebinds self.tensor to a
+        freshly built tensor on the session's device.
+        """
+        words = self.tensor.cpu().numpy().view('uint32')
+        # The capacity lives in the buffer so the GPU can read it at runtime
+        words[1] = self.max_entries
+        (largest_bits,) = struct.unpack('I', struct.pack('f', 3.4028235e+38))
+        (smallest_bits,) = struct.unpack('I', struct.pack('f', -3.4028235e+38))
+        stats_end = self._stats_base + METAL_DEBUG_STATS_TAGS * 4
+        for slot in range(self._stats_base, stats_end, 4):
+            # A running minimum starts at the largest float, a maximum at the smallest
+            words[slot + 1] = largest_bits
+            words[slot + 2] = smallest_bits
+        seeded = torch.from_numpy(words.view('int32').copy())
+        self.tensor = seeded.to(self._device)
+    def reset(self):
+        """Zero the debug buffer and re-seed it for the next dispatch.
+        self.tensor is rebound to a new tensor object, so read dbg.tensor
+        again after calling this rather than reusing an earlier reference.
+        """
+        zeroed = torch.zeros_like(self.tensor)
+        self.tensor = zeroed
+        self._init_buffer()
+    # ── Tag names ────────────────────────────────────────────────────────────
+    def set_tag_name(self, tag: int, name: str):
+        """Register a display name for a numeric tag, replacing any earlier one."""
+        self._tag_names.update({tag: name})
+    def load_tag_names(self, path: str):
+        """Load tag names from a .tags.json file (generated by preprocessor).
+        Args:
+            path: Path to a JSON object mapping tag numbers (as strings) to names.
+        Raises:
+            OSError: If the file cannot be opened.
+            ValueError: If the file is not valid JSON or a key is not an integer.
+        Names are merged into the existing table; entries for the same tag
+        are overwritten.
+        """
+        # Read as UTF-8 explicitly: the default encoding of open() depends on
+        # the platform locale and can mangle non-ASCII tag names.
+        with open(path, encoding="utf-8") as f:
+            data = json.load(f)
+        for tag_str, name in data.items():
+            self._tag_names[int(tag_str)] = name
+    def _tag_label(self, tag: int) -> str:
+        """Return "name(tag)" when a non-empty name is registered, else the tag number."""
+        registered = self._tag_names.get(tag)
+        if registered:
+            return f"{registered}({tag})"
+        return str(tag)
+    # ── Read entries ─────────────────────────────────────────────────────────
+    def _buf(self):
+        """Copy the buffer tensor to the CPU and view its words as uint32."""
+        host_copy = self.tensor.cpu()
+        return host_copy.numpy().view('uint32')
+    def entry_count(self) -> int:
+        """Return the number of stored entries: the counter in word 0, capped at max_entries."""
+        attempted = int(self._buf()[0])
+        capacity = self.max_entries
+        return capacity if capacity < attempted else attempted
+    def entries(self) -> list[dict]:
+        """Decode every stored log entry into a dictionary.
+        Returns:
+            One dict per entry, in buffer order, with the keys 'thread',
+            'tag', 'type', 'value_bits', 'value' (formatted text) and
+            'tag_name' (display label). At most max_entries entries are
+            decoded, even when the counter in word 0 is larger.
+        """
+        words = self._buf()
+        stored = min(int(words[0]), self.max_entries)
+        first = self._entries_base
+        decoded = []
+        # Each entry is four consecutive words: thread, tag, type, value bits
+        for start in range(first, first + stored * 4, 4):
+            thread, tag, type_id, bits = (int(words[start + k]) for k in range(4))
+            decoded.append({
+                'thread': thread,
+                'tag': tag,
+                'type': type_id,
+                'value_bits': bits,
+                'value': _decode_value(type_id, bits),
+                'tag_name': self._tag_label(tag),
+            })
+        return decoded
+    # ── Dump ─────────────────────────────────────────────────────────────────
+    def dump(self, tag: Optional[int] = None, thread: Optional[int] = None,
+             thread_range: Optional[tuple[int, int]] = None):
+        """Print logged entries ordered by (thread, tag), optionally filtered.
+        Args:
+            tag: Keep only entries carrying this tag.
+            thread: Keep only entries written by this thread.
+            thread_range: Inclusive (first, last) thread bounds to keep.
+        The header line reports how many entries are shown after filtering
+        and, when the counter in word 0 exceeds max_entries, the attempted total.
+        """
+        # Collect one predicate per active filter; an entry must satisfy all of them
+        predicates = []
+        shown = self.entries()
+        if tag is not None:
+            predicates.append(lambda e: e['tag'] == tag)
+        if thread is not None:
+            predicates.append(lambda e: e['thread'] == thread)
+        if thread_range is not None:
+            first, last = thread_range
+            predicates.append(lambda e: first <= e['thread'] <= last)
+        shown = [e for e in shown if all(accept(e) for accept in predicates)]
+        shown.sort(key=lambda e: (e['thread'], e['tag']))
+        attempted = self._buf()[0]
+        overflow = ""
+        if attempted > self.max_entries:
+            overflow = f" (OVERFLOW: {attempted} attempted)"
+        print(f"[metal-debug] {len(shown)} entries{overflow}")
+        for e in shown:
+            print(f"  thread[{e['thread']}] {e['tag_name']}: {e['value']}")
+    # ── Assertions ───────────────────────────────────────────────────────────
+    @property
+    def has_assertion_failures(self) -> bool:
+        """True when at least one stored entry has type TYPE_ASSERT_FAIL.
+        NOTE(review): breakpoint markers appear to share this type code (see
+        the 0xDEAD case in _decode_value), so they count here as well.
+        """
+        for entry in self.entries():
+            if entry['type'] == TYPE_ASSERT_FAIL:
+                return True
+        return False
+    def dump_assertions(self):
+        """Print every stored entry of type TYPE_ASSERT_FAIL, or a note that there are none."""
+        failures = list(filter(lambda e: e['type'] == TYPE_ASSERT_FAIL, self.entries()))
+        if failures:
+            print("[metal-debug] ASSERTION FAILURES:")
+            for failure in failures:
+                print(f"  thread[{failure['thread']}] {failure['tag_name']}: {failure['value']}")
+        else:
+            print("[metal-debug] No assertion failures.")
+    # ── Stats ────────────────────────────────────────────────────────────────
+    def stats(self, tag: int) -> dict:
+        """Return the cross-thread statistics accumulated for ``tag``.
+        Each stats slot is four words: a count, the float bits of the minimum,
+        the float bits of the maximum, and a signed 32-bit sum that is divided
+        by 1024 here before averaging.
+        Args:
+            tag: Stats slot index.
+        Returns:
+            A dict with 'count', 'min', 'max' and 'mean'; all zero when
+            nothing was recorded for the tag.
+        Raises:
+            IndexError: If ``tag`` lies outside the stats region of the buffer.
+        """
+        # The stats region ends where the histogram region begins. Without
+        # this check an out-of-range tag would silently decode words that
+        # belong to log entries or histogram bins as statistics.
+        stats_tags = (self._hist_base - self._stats_base) // 4
+        if not 0 <= tag < stats_tags:
+            raise IndexError(
+                f"stats tag {tag} out of range (valid: 0..{stats_tags - 1})")
+        buf = self._buf()
+        base = self._stats_base + tag * 4
+        count = int(buf[base])
+        if count == 0:
+            return {'count': 0, 'min': 0, 'max': 0, 'mean': 0}
+        min_val = _decode_float(int(buf[base + 1]))
+        max_val = _decode_float(int(buf[base + 2]))
+        # Reinterpret the unsigned word as the signed fixed-point sum
+        fixed_sum = struct.unpack('i', struct.pack('I', int(buf[base + 3])))[0]
+        mean = (fixed_sum / 1024.0) / count
+        return {'count': count, 'min': min_val, 'max': max_val, 'mean': mean}
+    def dump_stats(self, tag: int):
+        """Print count, min, max and mean for ``tag``, or "no data" when the count is zero."""
+        summary = self.stats(tag)
+        label = self._tag_label(tag)
+        if summary['count'] == 0:
+            line = f"[metal-debug] stats {label}: no data"
+        else:
+            figures = (f"count={summary['count']}, min={summary['min']:.6g}, "
+                       f"max={summary['max']:.6g}, mean={summary['mean']:.6g}")
+            line = f"[metal-debug] stats {label}: {figures}"
+        print(line)
+    # ── Histogram ────────────────────────────────────────────────────────────
+    def histogram(self, tag: int, lo: float, hi: float):
+        buf = self._buf()
+        base = self._hist_base + tag * METAL_DEBUG_HIST_BINS
+        bins = [int(buf[base + i]) for i in range(METAL_DEBUG_HIST_BINS)]
+        total = sum(bins)
+        if total == 0:
+            print(f"[metal-debug] histogram {self._tag_label(tag)}: no data")
+            return
+        max_count = max(bins)
+        bin_width = (hi - lo) / METAL_DEBUG_HIST_BINS
+        bar_max = 40
+        print(f"[metal-debug] histogram {self._tag_label(tag)} "
+              f"({total} values, [{lo:.4g}, {hi:.4g}]):")
+        for i, c in enumerate(bins):
+            if c == 0:
+                continue
+            b_lo = lo + i * bin_width
+            b_hi = b_lo + bin_width
+            bar_len = max(1, int(c / max_count * bar_max)) if c > 0 else 0
+            bar = "█" * bar_len
+            print(f"  [{b_lo:7.3f}, {b_hi:7.3f}) {c:6d} |{bar}")
+    # ── Breakpoints ──────────────────────────────────────────────────────────
+    @property
+    def breakpoint_hit(self) -> bool:
+        buf = self._buf()
+        return buf[self._break_base] != 0
+    def dump_breakpoint(self):
+        buf = self._buf()
+        if buf[self._break_base] == 0:
+            print("[metal-debug] No breakpoint hit.")
+            return
+        tid = int(buf[self._break_base + 1])
+        tag = int(buf[self._break_base + 2])
+        print("[metal-debug] *** BREAKPOINT HIT ***")
+        print(f"  First trigger: thread[{tid}] {self._tag_label(tag)}")
+        print(f"  Debug state at break:")
+        entries = [e for e in self.entries() if e['thread'] == tid]
+        entries.sort(key=lambda e: e['tag'])
+        for e in entries:
+            print(f"    {e['tag_name']}: {e['value']}")
+        # Count breakpoint entries
+        all_entries = self.entries()
+        hit_count = sum(1 for e in all_entries
+                       if e['type'] == TYPE_ASSERT_FAIL and e['value_bits'] == 0xDEAD)
+        if hit_count > 1:
+            print(f"  ({hit_count} threads hit this breakpoint)")
+    # ── Grid view ────────────────────────────────────────────────────────────
+    def grid(self, tag: int, width: int, height: int):
+        entries = [e for e in self.entries() if e['tag'] == tag]
+        # Build thread_id → value map (last write wins)
+        values = {}
+        for e in entries:
+            values[e['thread']] = e['value']
+        if not values:
+            print(f"[metal-debug] grid {self._tag_label(tag)}: no data")
+            return
+        max_len = max(3, max(len(v) for v in values.values()))
+        max_len = min(max_len, 10)
+        print(f"[metal-debug] grid {self._tag_label(tag)} ({width}x{height}):")
+        # Header
+        header = "     " + "".join(f" {x:>{max_len}}" for x in range(width))
+        print(header)
+        separator = "     " + "".join(" " + "─" * max_len for _ in range(width))
+        print(separator)
+        for y in range(height):
+            row = f" {y:3d}│"
+            for x in range(width):
+                tid = y * width + x
+                val = values.get(tid, "·")
+                if len(val) > max_len:
+                    val = val[:max_len]
+                row += f" {val:>{max_len}}"
+            print(row)
+    # ── Diff ─────────────────────────────────────────────────────────────────
+    def snapshot(self, label: str = "") -> tuple[list[dict], str]:
+        return (self.entries(), label)
+    # ── Interactive TUI ────────────────────────────────────────────────────
+    def explore(self, grid_width: int = 0, grid_height: int = 0,
+                hist_tag: int = 0, hist_lo: float = 0, hist_hi: float = 100):
+        """Launch interactive TUI explorer."""
+        from tui import explore
+        explore(self, grid_width=grid_width, grid_height=grid_height,
+                hist_tag=hist_tag, hist_lo=hist_lo, hist_hi=hist_hi)
+    # ── Diff ─────────────────────────────────────────────────────────────────
+    @staticmethod
+    def diff(snap_a: tuple, snap_b: tuple):
+        entries_a, label_a = snap_a
+        entries_b, label_b = snap_b
+        map_a = {(e['thread'], e['tag']): e['value'] for e in entries_a}
+        map_b = {(e['thread'], e['tag']): e['value'] for e in entries_b}
+        all_keys = sorted(set(map_a.keys()) | set(map_b.keys()))
+        print(f'[metal-debug] diff: "{label_a or "A"}" vs "{label_b or "B"}"')
+        added = removed = changed = same = 0
+        for key in all_keys:
+            tid, tag = key
+            va = map_a.get(key)
+            vb = map_b.get(key)
+            if va and not vb:
+                print(f"  - thread[{tid}] tag={tag}: {va}")
+                removed += 1
+            elif not va and vb:
+                print(f"  + thread[{tid}] tag={tag}: {vb}")
+                added += 1
+            elif va != vb:
+                print(f"  ~ thread[{tid}] tag={tag}: {va} → {vb}")
+                changed += 1
+            else:
+                same += 1
+        print(f"\n  Summary: {same} same, {changed} changed, "
+              f"{added} added, {removed} removed")

metal_debug-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0