PyPI - warpscope - Versions diffs - 0.1.0__py3-none-any.whl - Mend

warpscope 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

warpscope/__init__.py +48 -0
warpscope/_decode.py +141 -0
warpscope/_perfetto.py +83 -0
warpscope/_wire.py +57 -0
warpscope/buffer.py +79 -0
warpscope/cli.py +43 -0
warpscope/include/warpscope.cuh +115 -0
warpscope/include/warpscope_host.hpp +168 -0
warpscope-0.1.0.dist-info/METADATA +156 -0
warpscope-0.1.0.dist-info/RECORD +14 -0
warpscope-0.1.0.dist-info/WHEEL +5 -0
warpscope-0.1.0.dist-info/entry_points.txt +2 -0
warpscope-0.1.0.dist-info/licenses/LICENSE +37 -0
warpscope-0.1.0.dist-info/top_level.txt +1 -0

warpscope/__init__.py ADDED Viewed

@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 warpscope contributors
+#
+# Credits:
+#   - Original idea & design: 侯博涵 (Hou Bohan)
+#       https://zhuanlan.zhihu.com/p/2054305616391304228
+#   - Wire format / decode / Perfetto export adapted from Apache TVM TIRx CudaProfiler:
+#       https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
+#       https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
+#   - Implementation written by Claude Opus (Anthropic).
+"""warpscope - in-kernel %globaltimer profiler for warp-specialized CUDA kernels.
+Bracket logical stages inside a CUDA kernel with the device header
+``warpscope.cuh``, pass a zeroed uint64 buffer, then decode it here into per-
+(block, group) durations or a Perfetto/Chrome trace file.
+"""
+from ._decode import Instant, Span, TraceResult, decode
+from ._wire import (
+    EVENT_TYPE_BEGIN,
+    EVENT_TYPE_END,
+    EVENT_TYPE_FINALIZE,
+    EVENT_TYPE_INSTANT,
+    WIRE_VERSION,
+    decode_tag,
+    pack_tag,
+)
+from .buffer import Profiler
+from .cli import include_dir
+# Package version string; the CLI's --version flag prints this value.
+__version__ = "0.1.0"
+# Names re-exported from the submodules as the package's public API.
+__all__ = [
+    "Profiler",
+    "decode",
+    "TraceResult",
+    "Span",
+    "Instant",
+    "include_dir",
+    "decode_tag",
+    "pack_tag",
+    "WIRE_VERSION",
+    "EVENT_TYPE_BEGIN",
+    "EVENT_TYPE_END",
+    "EVENT_TYPE_INSTANT",
+    "EVENT_TYPE_FINALIZE",
+    "__version__",
+]

warpscope/_decode.py ADDED Viewed

@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 warpscope contributors
+#
+# Credits:
+#   - Original idea & design: 侯博涵 (Hou Bohan)
+#       https://zhuanlan.zhihu.com/p/2054305616391304228
+#   - Decode logic adapted from Apache TVM TIRx CudaProfiler (Apache-2.0):
+#       https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
+#       https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
+#   - Implementation written by Claude Opus (Anthropic).
+"""Decode a warpscope profiler buffer into spans/instants."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+import numpy as np
+from ._wire import (
+    EVENT_TYPE_BEGIN,
+    EVENT_TYPE_END,
+    EVENT_TYPE_FINALIZE,
+    EVENT_TYPE_INSTANT,
+    decode_tag,
+)
+@dataclass
+class Span:
+    """A matched begin/end record pair for one event on one (block, group) lane."""
+    block: int  # block index recovered from the record tag
+    group: int  # group index within the block, recovered from the record tag
+    event_id: int  # event id field of the record tag
+    begin_ns: int  # 32-bit timestamp stored in the begin record
+    dur_ns: int  # end timestamp minus begin timestamp, taken modulo 2**32
+@dataclass
+class Instant:
+    """A single instant-type record on one (block, group) lane."""
+    block: int  # block index recovered from the record tag
+    group: int  # group index within the block, recovered from the record tag
+    event_id: int  # event id field of the record tag
+    ts_ns: int  # 32-bit timestamp stored in the record
+@dataclass
+class TraceResult:
+    num_blocks: int
+    num_groups: int
+    spans: List[Span] = field(default_factory=list)
+    instants: List[Instant] = field(default_factory=list)
+    event_names: Optional[Dict[int, str]] = None
+    group_names: Optional[Dict[int, str]] = None
+    def event_name(self, event_id: int) -> str:
+        if self.event_names and event_id in self.event_names:
+            return self.event_names[event_id]
+        return f"event_{event_id}"
+    def group_name(self, group: int) -> str:
+        if self.group_names and group in self.group_names:
+            return self.group_names[group]
+        return f"group_{group}"
+    def print_durations(self) -> None:
+        lanes: Dict[tuple, List[Span]] = {}
+        for s in self.spans:
+            lanes.setdefault((s.block, s.group), []).append(s)
+        for (block, group) in sorted(lanes):
+            parts = ", ".join(f"{self.event_name(s.event_id)}={s.dur_ns}ns" for s in lanes[(block, group)])
+            print(f"block {block} {self.group_name(group)}: {parts}")
+    def summary(self) -> Dict[tuple, Dict[str, float]]:
+        """Mean/max duration per (group, event) across all blocks."""
+        agg: Dict[tuple, List[int]] = {}
+        for s in self.spans:
+            agg.setdefault((s.group, s.event_id), []).append(s.dur_ns)
+        out = {}
+        for (group, ev), vals in agg.items():
+            arr = np.asarray(vals, dtype=np.float64)
+            out[(self.group_name(group), self.event_name(ev))] = {
+                "count": int(arr.size),
+                "mean_ns": float(arr.mean()),
+                "max_ns": float(arr.max()),
+            }
+        return out
+    def to_perfetto(self, path: str) -> str:
+        """Alias for to_chrome_trace (Perfetto UI also reads Chrome JSON)."""
+        return self.to_chrome_trace(path)
+    def to_chrome_trace(self, path: str) -> str:
+        from ._perfetto import write_chrome_trace
+        write_chrome_trace(self, path)
+        return path
+def decode(
+    buf,
+    event_names: Optional[Dict[int, str]] = None,
+    group_names: Optional[Dict[int, str]] = None,
+) -> TraceResult:
+    """Decode a buffer (numpy array / torch tensor / sequence of uint64)."""
+    if hasattr(buf, "detach"):  # torch tensor
+        buf = buf.detach().cpu().numpy()
+    arr = np.ascontiguousarray(buf).view(np.uint64).ravel()
+    if arr.size == 0:
+        return TraceResult(0, 0, event_names=event_names, group_names=group_names)
+    header = int(arr[0])
+    num_groups = header >> 32
+    num_blocks = header & 0xFFFFFFFF
+    result = TraceResult(int(num_blocks), int(num_groups),
+                         event_names=event_names, group_names=group_names)
+    if num_groups == 0:
+        return result
+    opens: Dict[tuple, int] = {}
+    finished = set()
+    for i in range(1, arr.size):
+        w = int(arr[i])
+        if w == 0:
+            continue
+        ts = w >> 32
+        tag = w & 0xFFFFFFFF
+        block, group, ev, typ = decode_tag(tag, num_groups)
+        if typ == EVENT_TYPE_FINALIZE:
+            finished.add((block, group))
+            continue
+        if (block, group) in finished:
+            continue
+        if typ == EVENT_TYPE_BEGIN:
+            opens[(block, group, ev)] = ts
+        elif typ == EVENT_TYPE_END:
+            t0 = opens.pop((block, group, ev), None)
+            if t0 is not None:
+                result.spans.append(Span(block, group, ev, t0, (ts - t0) & 0xFFFFFFFF))
+        elif typ == EVENT_TYPE_INSTANT:
+            result.instants.append(Instant(block, group, ev, ts))
+    return result

warpscope/_perfetto.py ADDED Viewed

@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 warpscope contributors
+#
+# Credits:
+#   - Original idea & design: 侯博涵 (Hou Bohan)
+#       https://zhuanlan.zhihu.com/p/2054305616391304228
+#   - Perfetto export logic adapted from Apache TVM TIRx `export_to_perfetto_trace`
+#     (Apache-2.0; itself adapted from flashinfer):
+#       https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
+#       https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
+#   - Implementation written by Claude Opus (Anthropic).
+"""Trace-file writers for warpscope.
+Default output is Chrome Trace Event JSON, which opens in both chrome://tracing and
+https://ui.perfetto.dev with zero extra dependencies. An optional native
+.perfetto-trace backend is available via tg4perfetto if installed.
+"""
+from __future__ import annotations
+import json
+def write_chrome_trace(result, path: str) -> None:
+    """Write a Chrome Trace Event JSON file. pid = block, tid = group; ts/dur in us."""
+    events = []
+    for s in result.spans:
+        events.append({
+            "name": result.event_name(s.event_id),
+            "ph": "X",
+            "ts": s.begin_ns / 1000.0,
+            "dur": s.dur_ns / 1000.0,
+            "pid": s.block,
+            "tid": s.group,
+        })
+    for ins in result.instants:
+        events.append({
+            "name": result.event_name(ins.event_id),
+            "ph": "i",
+            "ts": ins.ts_ns / 1000.0,
+            "pid": ins.block,
+            "tid": ins.group,
+            "s": "t",
+        })
+    for b in range(result.num_blocks):
+        events.append({
+            "name": "process_name", "ph": "M", "pid": b,
+            "args": {"name": f"block_{b}"},
+        })
+        for g in range(result.num_groups):
+            events.append({
+                "name": "thread_name", "ph": "M", "pid": b, "tid": g,
+                "args": {"name": result.group_name(g)},
+            })
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump({"displayTimeUnit": "ns", "traceEvents": events}, f)
+def write_perfetto_native(result, path: str) -> None:
+    """Write a native .perfetto-trace via tg4perfetto (optional dependency).
+    Only spans are exported; instants in ``result`` are not written by this function.
+    Raises ImportError, with install instructions, when tg4perfetto is missing.
+    """
+    try:
+        from tg4perfetto import TraceGenerator
+    except ImportError as err:  # pragma: no cover
+        raise ImportError(
+            "tg4perfetto is required for native perfetto traces; "
+            "use to_chrome_trace() for a zero-dependency JSON instead, or "
+            "pip install git+https://github.com/ihavnoid/tg4perfetto.git"
+        ) from err
+    tgen = TraceGenerator(path)
+    # One top-level group per block, with one nested group per profiler group.
+    lanes = {}
+    for b in range(result.num_blocks):
+        pid = tgen.create_group(f"block_{b}")
+        for g in range(result.num_groups):
+            lanes[(b, g)] = pid.create_group(result.group_name(g))
+    # A track is created lazily for each (block, group, event_id) that has a span.
+    tracks = {}
+    for s in result.spans:
+        key = (s.block, s.group, s.event_id)
+        if key not in tracks:
+            # NOTE(review): raises KeyError if a span's block/group lies outside the
+            # header's num_blocks/num_groups ranges.
+            tracks[key] = lanes[(s.block, s.group)].create_track()
+        # NOTE(review): presumably open/close take nanosecond timestamps; confirm
+        # against the tg4perfetto API.
+        tracks[key].open(s.begin_ns, result.event_name(s.event_id))
+        tracks[key].close(s.begin_ns + s.dur_ns)
+    tgen.flush()

warpscope/_wire.py ADDED Viewed

@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 warpscope contributors
+#
+# Credits:
+#   - Original idea & design: 侯博涵 (Hou Bohan)
+#       https://zhuanlan.zhihu.com/p/2054305616391304228
+#   - Wire format adapted from Apache TVM TIRx CudaProfiler (Apache-2.0):
+#       https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
+#       https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
+#   - Implementation written by Claude Opus (Anthropic).
+"""warpscope wire format (v1) — the shared ABI between device and host.
+record = (globaltimer_lo32 << 32) | tag32
+tag32  = (block_group << 12) | (event_id << 2) | event_type
+block_group = block_idx * num_groups + group_id
+"""
+# Version of the record layout described in the module docstring.
+WIRE_VERSION = 1
+# Values of the 2-bit event_type field in the low bits of a tag.
+EVENT_TYPE_BEGIN = 0
+EVENT_TYPE_END = 1
+EVENT_TYPE_INSTANT = 2
+EVENT_TYPE_FINALIZE = 3
+# event_id occupies 10 bits, directly above the 2 event_type bits.
+_EVENT_ID_BITS = 10
+_EVENT_ID_MASK = (1 << _EVENT_ID_BITS) - 1  # 0x3FF
+_EVENT_TYPE_MASK = 0x3
+# Total tag width; pack_tag rejects any tag with bits set above this.
+_TAG_BITS = 32
+def pack_tag(block: int, group: int, num_groups: int, event_id: int, event_type: int) -> int:
+    """Pack a 32-bit tag (the low half of a record)."""
+    if not (0 <= event_id <= _EVENT_ID_MASK):
+        raise ValueError(f"event_id {event_id} exceeds {_EVENT_ID_MASK}")
+    if not (0 <= event_type <= _EVENT_TYPE_MASK):
+        raise ValueError(f"event_type {event_type} out of range")
+    block_group = block * num_groups + group
+    tag = (block_group << 12) | ((event_id & _EVENT_ID_MASK) << 2) | (event_type & _EVENT_TYPE_MASK)
+    if tag >> _TAG_BITS:
+        raise ValueError("tag overflows 32 bits (too many lanes); reduce num_blocks*num_groups")
+    return tag
+def decode_tag(tag: int, num_groups: int):
+    """Return (block, group, event_id, event_type) from a 32-bit tag."""
+    block_group = tag >> 12
+    event_id = (tag >> 2) & _EVENT_ID_MASK
+    event_type = tag & _EVENT_TYPE_MASK
+    return block_group // num_groups, block_group % num_groups, event_id, event_type
+def pack_record(timestamp_ns: int, tag: int) -> int:
+    return ((timestamp_ns & 0xFFFFFFFF) << 32) | (tag & 0xFFFFFFFF)
+def pack_header(num_groups: int, num_blocks: int) -> int:
+    return ((num_groups & 0xFFFFFFFF) << 32) | (num_blocks & 0xFFFFFFFF)

warpscope/buffer.py ADDED Viewed

@@ -0,0 +1,79 @@
+"""Host-side profiler buffer management."""
+from __future__ import annotations
+from typing import Dict, Optional
+import numpy as np
+from ._decode import TraceResult, decode
+class Profiler:
+    """Allocates and decodes a warpscope profiler buffer.
+    Parameters
+    ----------
+    num_blocks : int
+        Number of CTAs that will record (persistent grid: one block per SM).
+    num_groups : int
+        Logical sub-tracks per block (e.g. one per warp-group / role).
+    max_records_per_lane : int
+        Per-(block, group) record cap. Must be passed identically to the device
+        ``init(..., max_records_per_lane=...)`` so host buffer size and device cap agree.
+    device : str
+        "cuda"/"cuda:N" allocates a torch tensor on GPU (requires torch). "cpu"
+        allocates a numpy array (for testing/decoding pre-captured buffers).
+    """
+    def __init__(self, num_blocks: int, num_groups: int,
+                 max_records_per_lane: int = 64, device: str = "cuda"):
+        self.num_blocks = int(num_blocks)
+        self.num_groups = int(num_groups)
+        self.max_records_per_lane = int(max_records_per_lane)
+        self.write_stride = self.num_blocks * self.num_groups
+        self.num_slots = 1 + self.write_stride * self.max_records_per_lane
+        self.device = device
+        self._torch = None
+        self.tensor = None
+        self.array = None
+        if device != "cpu":
+            try:
+                import torch
+                self._torch = torch
+            except ImportError as err:
+                raise ImportError(
+                    f"device={device!r} requires torch; install torch or use device='cpu'"
+                ) from err
+            self.tensor = torch.zeros(self.num_slots, dtype=torch.int64, device=device)
+        else:
+            self.array = np.zeros(self.num_slots, dtype=np.uint64)
+    @property
+    def ptr(self) -> int:
+        """Device/host pointer to pass to the kernel as the profiler buffer arg."""
+        if self.tensor is not None:
+            return self.tensor.data_ptr()
+        return self.array.ctypes.data
+    def reset(self) -> None:
+        if self.tensor is not None:
+            self.tensor.zero_()
+        else:
+            self.array.fill(0)
+    def numpy(self) -> np.ndarray:
+        if self.tensor is not None:
+            return self.tensor.detach().cpu().numpy().view(np.uint64)
+        return self.array
+    def decode(self,
+               event_names: Optional[Dict[int, str]] = None,
+               group_names: Optional[Dict[int, str]] = None) -> TraceResult:
+        return decode(self.numpy(), event_names=event_names, group_names=group_names)
+    def __repr__(self) -> str:
+        return (f"Profiler(num_blocks={self.num_blocks}, num_groups={self.num_groups}, "
+                f"max_records_per_lane={self.max_records_per_lane}, "
+                f"write_stride={self.write_stride}, num_slots={self.num_slots})")

warpscope/cli.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""warpscope command-line helpers (build-system integration)."""
+from __future__ import annotations
+import argparse
+import os
+import sys
+def include_dir() -> str:
+    """Absolute path to the bundled CUDA/C++ headers (for `nvcc -I`)."""
+    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "include")
+def main(argv=None) -> int:
+    parser = argparse.ArgumentParser(
+        prog="warpscope",
+        description="In-kernel %globaltimer profiler: header path and version helpers.",
+    )
+    parser.add_argument("--include", action="store_true",
+                        help="Print the include directory containing warpscope.cuh")
+    parser.add_argument("--cuh", action="store_true",
+                        help="Print the full path to warpscope.cuh")
+    parser.add_argument("--version", action="store_true", help="Print the package version")
+    args = parser.parse_args(argv)
+    if args.include:
+        print(include_dir())
+        return 0
+    if args.cuh:
+        print(os.path.join(include_dir(), "warpscope.cuh"))
+        return 0
+    if args.version:
+        from . import __version__
+        print(__version__)
+        return 0
+    parser.print_help()
+    return 0
+# Allow running the module directly; the exit status is main()'s return value.
+if __name__ == "__main__":
+    sys.exit(main())

warpscope/include/warpscope.cuh ADDED Viewed

@@ -0,0 +1,115 @@
+// warpscope - in-kernel %globaltimer profiler (device header)
+//
+// Single-file, dependency-free, NVRTC-safe. Include this in any CUDA / CUTLASS /
+// NVRTC / JIT kernel, bracket logical stages with start()/end(), and pass a zeroed
+// uint64 buffer as an ordinary kernel argument. Decode on the host with
+// warpscope_host.hpp (C++) or the `warpscope` Python package.
+//
+// Wire format (v1, shared ABI):
+//   record = (globaltimer_lo32 << 32) | tag32
+//   tag32  = (block_group << 12) | (event_id << 2) | event_type
+//   block_group = blockIdx.x * num_groups + group_id
+//   event_type : 0=begin, 1=end, 2=instant, 3=finalize
+//   buf[0] header = (uint64(num_groups) << 32) | num_blocks   (written by block0/thread0)
+//
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026 warpscope contributors
+//
+// Credits:
+//   - Original idea & in-kernel CUDA profiler design: 侯博涵 (Hou Bohan)
+//       https://zhuanlan.zhihu.com/p/2054305616391304228
+//   - Wire format adapted from Apache TVM TIRx `CudaProfiler` (Apache-2.0):
+//       https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
+//       https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
+//   - Implementation written by Claude Opus (Anthropic).
+#pragma once
+#include <cstdint>
+#define WARPSCOPE_WIRE_VERSION 1
+namespace ws {
+// Low 32 bits of the global nanosecond timer. Wraps every ~4.29 s.
+// Reads the %globaltimer_lo special register with a single inline PTX move.
+// NOTE(review): `volatile` presumably keeps the compiler from hoisting or merging
+// repeated reads; confirm against the CUDA inline-PTX documentation.
+__device__ __forceinline__ uint32_t globaltimer_lo() {
+    uint32_t t;
+    asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(t));
+    return t;
+}
+// Strongly-typed event id wrapper so call sites read clearly.
+// `id` is the value placed in the tag's event_id field by Profiler::rec.
+struct Ev { uint32_t id; };
+// Record kinds stored in the low 2 bits of a tag (values fixed by the wire format).
+enum EventType : uint32_t {
+    kBegin    = 0,  // written by Profiler::start
+    kEnd      = 1,  // written by Profiler::end
+    kInstant  = 2,  // written by Profiler::inst
+    kFinalize = 3,  // written by Profiler::finalize
+};
+// One profiler instance per (block, group) recording thread.
+// kEnable == false compiles every method to nothing (zero-overhead off path).
+template <bool kEnable = true>
+struct Profiler {
+    uint64_t* buf      = nullptr;
+    uint32_t  stride   = 0;            // write_stride = num_blocks * num_groups
+    uint32_t  cursor   = 0;            // this lane's next slot
+    uint32_t  tag_base = 0;            // block_group << 12
+    uint32_t  cap      = 0xffffffffu;  // last writable slot (record cap)
+    bool      leader   = false;        // exactly one writer per (block, group)
+    // Call once, before any start(). `is_leader` must be true for exactly one
+    // thread of this (block, group); only that thread writes records.
+    //   p                    : zeroed buffer; a null pointer disables all writes
+    //   write_stride         : distance in slots between a lane's consecutive records
+    //   max_records_per_lane : 0xffffffff = uncapped, 0 = record nothing
+    // Block 0 / thread 0 additionally writes the slot-0 header.
+    __device__ void init(uint64_t* p, uint32_t write_stride, uint32_t group_id,
+                         uint32_t num_groups, uint32_t num_blocks, bool is_leader,
+                         uint32_t max_records_per_lane = 0xffffffffu) {
+        if constexpr (!kEnable) return;
+        buf = p;
+        stride = write_stride;
+        leader = is_leader;
+        const uint32_t bg = blockIdx.x * num_groups + group_id;
+        tag_base = bg << 12;
+        cursor   = 1u + bg;
+        if (max_records_per_lane == 0xffffffffu) {
+            cap = 0xffffffffu;
+        } else if (max_records_per_lane == 0u) {
+            // (0 - 1) * write_stride would wrap around and lift the cap entirely.
+            // cursor is always >= 1, so a cap of 0 blocks every write instead.
+            cap = 0u;
+        } else {
+            cap = 1u + bg + (max_records_per_lane - 1u) * write_stride;
+        }
+        // Same null guard as rec(): never dereference a missing buffer.
+        if (p != nullptr && blockIdx.x == 0 && threadIdx.x == 0)
+            buf[0] = (static_cast<uint64_t>(num_groups) << 32) | num_blocks;
+    }
+    // Append one record for event `ev` of kind `type` to this lane (leader only).
+    __device__ __forceinline__ void rec(uint32_t ev, uint32_t type) {
+        if constexpr (!kEnable) return;
+        // Order this thread's prior memory traffic before stamping the timer.
+        __threadfence_block();
+        if (leader && buf != nullptr && cursor <= cap) {
+            // Mask both fields to their wire widths so an out-of-range event id
+            // cannot spill into the block_group bits of the tag.
+            buf[cursor] = (static_cast<uint64_t>(globaltimer_lo()) << 32)
+                        | (tag_base | ((ev & 0x3ffu) << 2) | (type & 0x3u));
+            cursor += stride;
+        }
+    }
+    __device__ __forceinline__ void start(Ev e) { rec(e.id, kBegin); }
+    __device__ __forceinline__ void end  (Ev e) { rec(e.id, kEnd); }
+    __device__ __forceinline__ void inst (Ev e) { rec(e.id, kInstant); }
+    __device__ __forceinline__ void finalize()  { rec(0u, kFinalize); }
+};
+// RAII region guard: start() on construction, end() on scope exit.
+// Holds a reference to the profiler, so the profiler must outlive the guard.
+template <bool kEnable>
+struct ScopedRegion {
+    Profiler<kEnable>& p;  // profiler that receives the begin/end records
+    Ev e;                  // event id recorded at both ends of the scope
+    __device__ ScopedRegion(Profiler<kEnable>& p_, Ev e_) : p(p_), e(e_) { p.start(e); }
+    __device__ ~ScopedRegion() { p.end(e); }
+};
+// CTAD guide so `ws::ScopedRegion r{prof, Ev{...}}` deduces kEnable from the profiler.
+template <bool kEnable>
+ScopedRegion(Profiler<kEnable>&, Ev) -> ScopedRegion<kEnable>;
+} // namespace ws
+// Bracket a C++ scope: `{ WS_REGION(p, EvCompute); ...work... }`
+// The two-level concat makes __LINE__ expand before pasting, so each use gets a
+// guard variable named after its source line.
+#define WS_CONCAT_(a, b) a##b
+#define WS_CONCAT(a, b) WS_CONCAT_(a, b)
+#define WS_REGION(prof, ev) \
+    ::ws::ScopedRegion WS_CONCAT(_ws_region_, __LINE__){(prof), ::ws::Ev{(ev)}}

warpscope/include/warpscope_host.hpp ADDED Viewed

@@ -0,0 +1,168 @@
+// warpscope - host-side decoder + trace writer (header-only, pure C++)
+//
+// No CUDA, no protobuf, no external deps. Include in any host translation unit to
+// turn a warpscope profiler buffer (read back from device) into:
+//   - per-(block, group) stage durations (ws::decode)
+//   - a Chrome Trace Event JSON file openable in chrome://tracing AND
+//     https://ui.perfetto.dev (ws::write_chrome_trace)
+//
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026 warpscope contributors
+//
+// Credits:
+//   - Original idea & in-kernel CUDA profiler design: 侯博涵 (Hou Bohan)
+//       https://zhuanlan.zhihu.com/p/2054305616391304228
+//   - Wire format and host decode/Perfetto export adapted from Apache TVM TIRx
+//     `CudaProfiler` / `export_to_perfetto_trace` (Apache-2.0):
+//       https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
+//       https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
+//   - Implementation written by Claude Opus (Anthropic).
+#pragma once
+#include <cstdint>
+#include <cstdio>
+#include <map>
+#include <string>
+#include <tuple>
+#include <vector>
+namespace ws {
+// A matched begin/end record pair for one event on one (block, group) lane.
+struct Span {
+    uint32_t block;     // block index recovered from the record tag
+    uint32_t group;     // group index within the block
+    uint32_t event_id;  // event id field of the tag
+    uint32_t begin_ns;  // 32-bit timestamp of the begin record
+    uint32_t dur_ns;    // end minus begin, modulo 2^32
+};
+// A single instant-type record on one (block, group) lane.
+struct Instant {
+    uint32_t block;     // block index recovered from the record tag
+    uint32_t group;     // group index within the block
+    uint32_t event_id;  // event id field of the tag
+    uint32_t ts_ns;     // 32-bit timestamp of the record
+};
+// Result of decode(): header counts plus every span and instant found.
+struct Trace {
+    uint32_t num_blocks = 0;  // low 32 bits of the slot-0 header
+    uint32_t num_groups = 0;  // high 32 bits of the slot-0 header
+    std::vector<Span> spans;
+    std::vector<Instant> instants;
+};
+// Decode a buffer of `n` uint64 records (including the slot-0 header).
+// Returns an empty Trace for a null/empty buffer; returns header counts only when
+// the header reports zero groups. Zero words are skipped as empty slots, and
+// records after a lane's finalize record are ignored.
+inline Trace decode(const uint64_t* buf, size_t n) {
+    Trace out;
+    if (buf == nullptr || n == 0) return out;
+    const uint64_t header = buf[0];
+    out.num_groups = static_cast<uint32_t>(header >> 32);
+    out.num_blocks = static_cast<uint32_t>(header & 0xffffffffu);
+    if (out.num_groups == 0) return out;
+    using LaneKey = std::pair<uint32_t, uint32_t>;
+    using OpenKey = std::tuple<uint32_t, uint32_t, uint32_t>;
+    std::map<OpenKey, uint32_t> pending;  // (block, group, event) -> begin timestamp
+    std::map<LaneKey, bool> done;         // lanes that have emitted a finalize record
+    for (size_t slot = 1; slot < n; ++slot) {
+        const uint64_t word = buf[slot];
+        if (word == 0) continue;
+        const uint32_t stamp = static_cast<uint32_t>(word >> 32);
+        const uint32_t tag   = static_cast<uint32_t>(word & 0xffffffffu);
+        const uint32_t kind  = tag & 0x3u;
+        const uint32_t ev    = (tag >> 2) & 0x3ffu;
+        const uint32_t lane_index = tag >> 12;
+        const uint32_t block = lane_index / out.num_groups;
+        const uint32_t group = lane_index % out.num_groups;
+        const LaneKey lane{block, group};
+        if (kind == 3u) {  // finalize
+            done[lane] = true;
+            continue;
+        }
+        if (done.find(lane) != done.end()) continue;
+        switch (kind) {
+        case 0u:  // begin
+            pending[OpenKey{block, group, ev}] = stamp;
+            break;
+        case 1u: {  // end
+            auto hit = pending.find(OpenKey{block, group, ev});
+            if (hit == pending.end()) break;
+            // Unsigned subtraction keeps the duration valid across a timer wrap.
+            out.spans.push_back(Span{block, group, ev, hit->second,
+                                     (stamp - hit->second) & 0xffffffffu});
+            pending.erase(hit);
+            break;
+        }
+        case 2u:  // instant
+            out.instants.push_back(Instant{block, group, ev, stamp});
+            break;
+        default:
+            break;
+        }
+    }
+    return out;
+}
+// Vector convenience overload: decodes the vector's contiguous storage.
+inline Trace decode(const std::vector<uint64_t>& buf) {
+    const uint64_t* words = buf.data();
+    const size_t count = buf.size();
+    return decode(words, count);
+}
+namespace detail {
+// Name for event `id`, or the literal "event" when `names` has no entry for it.
+// The returned pointer refers into `names` and is valid only while it is unchanged.
+inline const char* ev_name(const std::vector<std::string>& names, uint32_t id) {
+    if (id >= names.size()) return "event";
+    return names[id].c_str();
+}
+// Name for group `id`, or "group_<id>" when `names` has no entry for it.
+inline std::string grp_name(const std::vector<std::string>& names, uint32_t id) {
+    if (id < names.size()) return names[id];
+    std::string fallback = "group_";
+    fallback += std::to_string(id);
+    return fallback;
+}
+}  // namespace detail
+// Write a Chrome Trace Event JSON file. pid = block, tid = group; ts/dur in us.
+// Event and group names are JSON-escaped, so quotes, backslashes and control
+// characters in a name cannot produce an invalid file.
+// If `path` cannot be opened the function returns without writing anything.
+inline void write_chrome_trace(const Trace& tr,
+                               const std::vector<std::string>& event_names,
+                               const std::vector<std::string>& group_names,
+                               const std::string& path) {
+    FILE* f = std::fopen(path.c_str(), "w");
+    if (f == nullptr) return;
+    // Escape a name for use inside a double-quoted JSON string.
+    auto esc = [](const std::string& raw) {
+        std::string out;
+        out.reserve(raw.size());
+        for (const char c : raw) {
+            const unsigned char u = static_cast<unsigned char>(c);
+            if (c == '"' || c == '\\') {
+                out.push_back('\\');
+                out.push_back(c);
+            } else if (u < 0x20u) {
+                char tmp[8];
+                std::snprintf(tmp, sizeof tmp, "\\u%04x", static_cast<unsigned>(u));
+                out += tmp;
+            } else {
+                out.push_back(c);
+            }
+        }
+        return out;
+    };
+    std::fprintf(f, "{\"displayTimeUnit\":\"ns\",\"traceEvents\":[\n");
+    bool first = true;
+    // Yields "" before the first element and ",\n" before every later one.
+    auto sep = [&]() { const char* s = first ? "" : ",\n"; first = false; return s; };
+    for (const Span& s : tr.spans) {
+        std::fprintf(f,
+            "%s{\"name\":\"%s\",\"ph\":\"X\",\"ts\":%.3f,\"dur\":%.3f,\"pid\":%u,\"tid\":%u}",
+            sep(), esc(detail::ev_name(event_names, s.event_id)).c_str(),
+            s.begin_ns / 1000.0, s.dur_ns / 1000.0, s.block, s.group);
+    }
+    for (const Instant& in : tr.instants) {
+        std::fprintf(f,
+            "%s{\"name\":\"%s\",\"ph\":\"i\",\"ts\":%.3f,\"pid\":%u,\"tid\":%u,\"s\":\"t\"}",
+            sep(), esc(detail::ev_name(event_names, in.event_id)).c_str(),
+            in.ts_ns / 1000.0, in.block, in.group);
+    }
+    // Readable lane names (process per block, thread per group).
+    for (uint32_t b = 0; b < tr.num_blocks; ++b) {
+        std::fprintf(f,
+            "%s{\"name\":\"process_name\",\"ph\":\"M\",\"pid\":%u,\"args\":{\"name\":\"block_%u\"}}",
+            sep(), b, b);
+        for (uint32_t g = 0; g < tr.num_groups; ++g) {
+            std::fprintf(f,
+                "%s{\"name\":\"thread_name\",\"ph\":\"M\",\"pid\":%u,\"tid\":%u,\"args\":{\"name\":\"%s\"}}",
+                sep(), b, g, esc(detail::grp_name(group_names, g)).c_str());
+        }
+    }
+    std::fprintf(f, "\n]}\n");
+    std::fclose(f);
+}
+// Convenience: decode the raw buffer, then write the resulting trace to `path`.
+inline void write_chrome_trace(const uint64_t* buf, size_t n,
+                               const std::vector<std::string>& event_names,
+                               const std::vector<std::string>& group_names,
+                               const std::string& path) {
+    const Trace decoded = decode(buf, n);
+    write_chrome_trace(decoded, event_names, group_names, path);
+}
+// Print per-(block, group) stage durations to stdout.
+// One line per lane, in ascending (block, group) order; spans keep decode order.
+inline void print_durations(const Trace& tr,
+                            const std::vector<std::string>& event_names) {
+    using Lane = std::pair<uint32_t, uint32_t>;
+    std::map<Lane, std::vector<const Span*>> lanes;
+    for (const Span& span : tr.spans) {
+        lanes[Lane{span.block, span.group}].push_back(&span);
+    }
+    for (const auto& entry : lanes) {
+        const Lane& lane = entry.first;
+        std::printf("block %u group %u:", lane.first, lane.second);
+        for (const Span* span : entry.second) {
+            std::printf(" %s=%uns", detail::ev_name(event_names, span->event_id),
+                        span->dur_ns);
+        }
+        std::printf("\n");
+    }
+}
+}  // namespace ws

warpscope-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,156 @@
+Metadata-Version: 2.4
+Name: warpscope
+Version: 0.1.0
+Summary: In-kernel %globaltimer profiler for warp-specialized CUDA kernels (Perfetto/Chrome timelines).
+Author: warpscope contributors
+License: MIT
+Project-URL: Homepage, https://github.com/YangWang92/warpscope
+Keywords: cuda,profiler,globaltimer,perfetto,gpu,warp-specialized,tracing
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Environment :: GPU :: NVIDIA CUDA
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.21
+Provides-Extra: torch
+Requires-Dist: torch; extra == "torch"
+Provides-Extra: perfetto
+Requires-Dist: tg4perfetto; extra == "perfetto"
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Dynamic: license-file
+# warpscope
+English | [中文](README.zh.md)
+> Credits: the idea and design all come from **侯博涵 (Hou Bohan)**'s write-up
+> ([zhihu](https://zhuanlan.zhihu.com/p/2054305616391304228)); the wire format and the
+> host-side decode / Perfetto export are adapted from **Apache TVM TIRx `CudaProfiler`**
+> ([bench.py](https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py),
+> [docs](https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html)).
+> The implementation in this repo was **written by Claude Opus (Anthropic)**.
+In-kernel `%globaltimer` profiler for **warp-specialized CUDA kernels**. Bracket the
+logical stages inside a kernel (TMA load, MMA, softmax, epilogue, ...) with
+`start`/`end` markers; one leader thread per logical group stamps the GPU global timer
+into a buffer you pass as an ordinary kernel argument. Decode it on the host into
+per-`(block, group)` durations or a **Perfetto / Chrome trace** to see how the
+producer and consumer warp-groups actually overlap — something total launch time and
+SM-level counters can't show.
+It is **not** zero cost (a timer read + a global store + a block fence per event), so
+it is a debugging/analysis tool. Build with the profiler disabled for production.
+## Layout
+```
+warpscope/
+  include/
+    warpscope.cuh         # device header (header-only, NVRTC-safe)
+    warpscope_host.hpp    # host decoder + Chrome-trace writer (header-only, pure C++)
+  *.py                    # Python: Profiler buffer mgmt, decode, trace export
+examples/                 # toy CUDA program (pure C++ path) + python driver
+tests/                    # wire-format + decode tests
+```
+## Install
+```bash
+pip install -e .            # core (numpy only)
+pip install -e ".[torch]"   # + GPU buffer allocation
+pip install -e ".[dev]"     # + pytest
+```
+## Device side (CUDA C++)
+```cpp
+#include <warpscope.cuh>
+enum : uint32_t { EvWait = 0, EvWork = 1 };
+__global__ void k(..., uint64_t* prof, uint32_t stride,
+                  uint32_t num_groups, uint32_t num_blocks, uint32_t max_rec) {
+    ws::Profiler<true> p;     // <false> compiles to a no-op
+    const uint32_t warp = threadIdx.x / 32, lane = threadIdx.x % 32;
+    if (warp == 0) {                                  // e.g. TMA producer = group 0
+        p.init(prof, stride, /*group=*/0, num_groups, num_blocks,
+               /*leader=*/lane == 0, max_rec);
+        { WS_REGION(p, EvWait); /* barrier wait */ }  // RAII start/end
+        { WS_REGION(p, EvWork); /* issue work  */ }
+        p.finalize();
+    }
+    // ... other warp-groups: init with their own group id + one leader each ...
+}
+```
+Build: `nvcc -I"$(warpscope --include)" -arch=sm_100a my.cu`
+## Host side — pick one
+**Pure C++ (header-only, no Python):**
+```cpp
+#include <warpscope_host.hpp>
+std::vector<uint64_t> h(slots);                 // cudaMemcpy buffer back into h
+ws::write_chrome_trace(h.data(), h.size(),
+    /*events*/ {"wait", "work"},
+    /*groups*/ {"tma", "umma", "utccp", "epilogue"},
+    "trace.json");                              // open in chrome://tracing or perfetto
+```
+**Python:**
+```python
+import warpscope as ws
+prof = ws.Profiler(num_blocks=num_sms, num_groups=4, max_records_per_lane=64)
+launch(..., prof.ptr)        # pass the device pointer
+torch.cuda.synchronize()
+res = prof.decode(event_names={0: "wait", 1: "work"},
+                  group_names={0: "tma", 1: "umma", 2: "utccp", 3: "epilogue"})
+res.print_durations()
+res.to_perfetto("trace.json")   # Chrome JSON; opens in ui.perfetto.dev too
+```
+## Output
+The raw output is a `uint64` buffer. Both host paths turn it into a **Chrome Trace
+Event JSON** file (`pid = block`, `tid = group`, `ts/dur` in microseconds) that opens
+directly in `chrome://tracing` and <https://ui.perfetto.dev>. A native
+`.perfetto-trace` writer is available via the optional `tg4perfetto` dependency.
+## Wire format (v1, shared ABI)
+```
+record = (globaltimer_lo32 << 32) | tag32
+tag32  = (block_group << 12) | (event_id << 2) | event_type
+block_group = block_idx * num_groups + group_id
+event_type : 0=begin 1=end 2=instant 3=finalize
+buf[0] header = (num_groups << 32) | num_blocks
+```
+Identical to the format used by TIRx/flashinfer, so traces are cross-tool compatible.
+## Caveats
+- Zero the buffer before launch (the decoder treats 0 as empty).
+- Exactly one leader thread per `(block, group)` lane (two writers clobber the cursor).
+- `%globaltimer_lo` is 32-bit ns: ~tens-of-ns resolution and a ~4.29 s wrap.
+- Persistent grids stream records without bound — cap them with `max_records_per_lane`
+  on the host, and pass the same value to the device-side
+  `init(..., max_records_per_lane=...)` so the cap is also enforced inside the kernel.
+- The fence + store perturb tight pipelines; keep events coarse and compare against an
+  unprofiled (`ws::Profiler<false>`) build.
+## Credits & License
+Licensed under the **MIT License** (see [LICENSE](LICENSE)).
+- **侯博涵 (Hou Bohan)** — original idea and write-up:
+  <https://zhuanlan.zhihu.com/p/2054305616391304228>
+- **Apache TVM TIRx `CudaProfiler`** (Apache-2.0) — wire format + host decode/Perfetto
+  export are adapted from it:
+  <https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py> ·
+  <https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html>
+- The CUDA/Python implementation in this repository was **written by Claude Opus (Anthropic)**.

warpscope-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,14 @@
+warpscope/__init__.py,sha256=x48196Y5D0FSj8j6xVmh6IBjaqS98Oae-p69jv6o-KE,1362
+warpscope/_decode.py,sha256=msIdBLi7xQMIDZCkbcLUVXHbnB9hS7hq439uMRmG0ko,4575
+warpscope/_perfetto.py,sha256=RDtoJH55xVF5y6hKANgcrt1XocsMoAe9ve842Q7Fqfk,3117
+warpscope/_wire.py,sha256=QUA9_azwqrjY6gyIAl4ou4TskPBCo7L6Z4SgUFFxy2Y,2163
+warpscope/buffer.py,sha256=Ys_oVaFYjUYnAnxhvhSrTW_-37xm7piVtGM23tFav7s,2922
+warpscope/cli.py,sha256=tWzzQzcB6-4HBUh7s8hKa9rtShCaC6H8KKBOKM4Qu9E,1250
+warpscope/include/warpscope.cuh,sha256=FMbJREQxEYykhSun0WhXeAHMUB16EEeFzrhFhot8ToM,4708
+warpscope/include/warpscope_host.hpp,sha256=_K5OxGNu0CeFSVkO-lUYay1zfnFL0plUOOgaxQxjY1M,6390
+warpscope-0.1.0.dist-info/licenses/LICENSE,sha256=ecqGRSwPBofcM0ZLMKUHmwNCPF50n5pM6ZSlrlHOKB4,1668
+warpscope-0.1.0.dist-info/METADATA,sha256=Pe_tHpCVemUsO8K5vY5hlygdiYTEjkvaLZsib9w457Y,6192
+warpscope-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+warpscope-0.1.0.dist-info/entry_points.txt,sha256=bCos2hE8zn2ZKbzNx-itvIuwmYB0-F3Hw3P2qZucl0k,49
+warpscope-0.1.0.dist-info/top_level.txt,sha256=PEA9PCUwAmLJPxGVtUg19HBdP7KKnZ4e2eIybQQK1iA,10
+warpscope-0.1.0.dist-info/RECORD,,

warpscope-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

warpscope-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+warpscope = warpscope.cli:main

warpscope-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,37 @@
+MIT License
+Copyright (c) 2026 warpscope contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+---
+Credits / Attribution
+This project's in-kernel CUDA profiling design and wire format originate from:
+  * 侯博涵 (Hou Bohan) — original idea and write-up:
+    https://zhuanlan.zhihu.com/p/2054305616391304228
+  * Apache TVM, TIRx `CudaProfiler` (Apache License 2.0) — wire format and the
+    host-side decode / Perfetto export logic are adapted from:
+    https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
+    https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
+The CUDA implementation in this repository was written by Claude Opus (Anthropic).

warpscope-0.1.0.dist-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+warpscope