warpscope 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warpscope/__init__.py ADDED
@@ -0,0 +1,48 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2026 warpscope contributors
3
+ #
4
+ # Credits:
5
+ # - Original idea & design: 侯博涵 (Hou Bohan)
6
+ # https://zhuanlan.zhihu.com/p/2054305616391304228
7
+ # - Wire format / decode / Perfetto export adapted from Apache TVM TIRx CudaProfiler:
8
+ # https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
9
+ # https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
10
+ # - Implementation written by Claude Opus (Anthropic).
11
+ """warpscope - in-kernel %globaltimer profiler for warp-specialized CUDA kernels.
12
+
13
+ Bracket logical stages inside a CUDA kernel with the device header
14
+ ``warpscope.cuh``, pass a zeroed uint64 buffer, then decode it here into per-
15
+ (block, group) durations or a Perfetto/Chrome trace file.
16
+ """
17
+
18
+ from ._decode import Instant, Span, TraceResult, decode
19
+ from ._wire import (
20
+ EVENT_TYPE_BEGIN,
21
+ EVENT_TYPE_END,
22
+ EVENT_TYPE_FINALIZE,
23
+ EVENT_TYPE_INSTANT,
24
+ WIRE_VERSION,
25
+ decode_tag,
26
+ pack_tag,
27
+ )
28
+ from .buffer import Profiler
29
+ from .cli import include_dir
30
+
31
+ __version__ = "0.1.0"
32
+
33
+ __all__ = [
34
+ "Profiler",
35
+ "decode",
36
+ "TraceResult",
37
+ "Span",
38
+ "Instant",
39
+ "include_dir",
40
+ "decode_tag",
41
+ "pack_tag",
42
+ "WIRE_VERSION",
43
+ "EVENT_TYPE_BEGIN",
44
+ "EVENT_TYPE_END",
45
+ "EVENT_TYPE_INSTANT",
46
+ "EVENT_TYPE_FINALIZE",
47
+ "__version__",
48
+ ]
warpscope/_decode.py ADDED
@@ -0,0 +1,141 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2026 warpscope contributors
3
+ #
4
+ # Credits:
5
+ # - Original idea & design: 侯博涵 (Hou Bohan)
6
+ # https://zhuanlan.zhihu.com/p/2054305616391304228
7
+ # - Decode logic adapted from Apache TVM TIRx CudaProfiler (Apache-2.0):
8
+ # https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
9
+ # https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
10
+ # - Implementation written by Claude Opus (Anthropic).
11
+ """Decode a warpscope profiler buffer into spans/instants."""
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Dict, List, Optional
17
+
18
+ import numpy as np
19
+
20
+ from ._wire import (
21
+ EVENT_TYPE_BEGIN,
22
+ EVENT_TYPE_END,
23
+ EVENT_TYPE_FINALIZE,
24
+ EVENT_TYPE_INSTANT,
25
+ decode_tag,
26
+ )
27
+
28
+
29
@dataclass
class Span:
    """A matched begin/end pair recorded by one (block, group) lane.

    Timestamps come from the 32-bit timer field of the wire record, so they
    are nanoseconds modulo 2**32 rather than absolute times.
    """

    block: int  # block index recovered from the tag
    group: int  # group index within the block
    event_id: int  # 10-bit user event id
    begin_ns: int  # timer value stored in the begin record
    dur_ns: int  # (end - begin) modulo 2**32
36
+
37
+
38
@dataclass
class Instant:
    """A single point-in-time marker recorded by one (block, group) lane."""

    block: int  # block index recovered from the tag
    group: int  # group index within the block
    event_id: int  # 10-bit user event id
    ts_ns: int  # timer value stored in the record (nanoseconds modulo 2**32)
44
+
45
+
46
@dataclass
class TraceResult:
    """Decoded contents of a profiler buffer plus optional display names.

    ``num_blocks`` / ``num_groups`` are the values read from the buffer
    header; ``spans`` and ``instants`` hold the decoded records in buffer
    order. ``event_names`` / ``group_names`` map ids to labels and may be
    ``None``, in which case ``event_<id>`` / ``group_<id>`` are used.
    """

    num_blocks: int
    num_groups: int
    spans: List[Span] = field(default_factory=list)
    instants: List[Instant] = field(default_factory=list)
    event_names: Optional[Dict[int, str]] = None
    group_names: Optional[Dict[int, str]] = None

    def event_name(self, event_id: int) -> str:
        """Return the label for ``event_id``, falling back to ``event_<id>``."""
        names = self.event_names
        if not names or event_id not in names:
            return f"event_{event_id}"
        return names[event_id]

    def group_name(self, group: int) -> str:
        """Return the label for ``group``, falling back to ``group_<id>``."""
        names = self.group_names
        if not names or group not in names:
            return f"group_{group}"
        return names[group]

    def print_durations(self) -> None:
        """Print one line per (block, group) lane listing its span durations."""
        by_lane: Dict[tuple, List[Span]] = {}
        for span in self.spans:
            lane = (span.block, span.group)
            if lane not in by_lane:
                by_lane[lane] = []
            by_lane[lane].append(span)
        for lane in sorted(by_lane):
            block, group = lane
            rendered = [
                f"{self.event_name(span.event_id)}={span.dur_ns}ns"
                for span in by_lane[lane]
            ]
            parts = ", ".join(rendered)
            print(f"block {block} {self.group_name(group)}: {parts}")

    def summary(self) -> Dict[tuple, Dict[str, float]]:
        """Mean/max duration per (group, event) across all blocks.

        Keys are ``(group_name, event_name)`` tuples; each value holds
        ``count``, ``mean_ns`` and ``max_ns``.
        """
        samples: Dict[tuple, List[int]] = {}
        for span in self.spans:
            samples.setdefault((span.group, span.event_id), []).append(span.dur_ns)

        stats = {}
        for (group, event_id), durations in samples.items():
            values = np.asarray(durations, dtype=np.float64)
            label = (self.group_name(group), self.event_name(event_id))
            stats[label] = {
                "count": int(values.size),
                "mean_ns": float(values.mean()),
                "max_ns": float(values.max()),
            }
        return stats

    def to_perfetto(self, path: str) -> str:
        """Alias for to_chrome_trace (Perfetto UI also reads Chrome JSON)."""
        return self.to_chrome_trace(path)

    def to_chrome_trace(self, path: str) -> str:
        """Write the trace as Chrome Trace Event JSON to ``path`` and return ``path``."""
        # Imported lazily so the writer module is only loaded when exporting.
        from ._perfetto import write_chrome_trace

        write_chrome_trace(self, path)
        return path
97
+
98
+
99
def decode(
    buf,
    event_names: Optional[Dict[int, str]] = None,
    group_names: Optional[Dict[int, str]] = None,
) -> TraceResult:
    """Decode a buffer (numpy array / torch tensor / sequence of uint64).

    Slot 0 is the header ``(num_groups << 32) | num_blocks``; every other
    non-zero slot is a record ``(timestamp << 32) | tag``. Begin/end records
    with the same (block, group, event) are paired into spans, instant
    records become instants, and everything a lane wrote after its finalize
    record is ignored. An end without a pending begin is dropped.
    """
    if hasattr(buf, "detach"):  # torch tensor
        buf = buf.detach().cpu().numpy()
    words = np.ascontiguousarray(buf).view(np.uint64).ravel()
    if words.size == 0:
        return TraceResult(0, 0, event_names=event_names, group_names=group_names)

    header = int(words[0])
    num_groups = header >> 32
    num_blocks = header & 0xFFFFFFFF
    trace = TraceResult(
        num_blocks, num_groups, event_names=event_names, group_names=group_names
    )
    if num_groups == 0:
        # Header never written (or zeroed): lanes cannot be recovered from tags.
        return trace

    pending: Dict[tuple, int] = {}  # (block, group, event) -> begin timestamp
    finalized = set()  # lanes that already emitted a finalize record
    for word in words[1:].tolist():
        if word == 0:
            continue  # untouched slot
        timestamp = word >> 32
        block, group, event, kind = decode_tag(word & 0xFFFFFFFF, num_groups)
        lane = (block, group)
        if kind == EVENT_TYPE_FINALIZE:
            finalized.add(lane)
            continue
        if lane in finalized:
            continue
        if kind == EVENT_TYPE_BEGIN:
            pending[(block, group, event)] = timestamp
        elif kind == EVENT_TYPE_END:
            started = pending.pop((block, group, event), None)
            if started is not None:
                # Masked subtraction keeps the duration correct across a
                # 32-bit timer wrap.
                elapsed = (timestamp - started) & 0xFFFFFFFF
                trace.spans.append(Span(block, group, event, started, elapsed))
        elif kind == EVENT_TYPE_INSTANT:
            trace.instants.append(Instant(block, group, event, timestamp))
    return trace
warpscope/_perfetto.py ADDED
@@ -0,0 +1,83 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2026 warpscope contributors
3
+ #
4
+ # Credits:
5
+ # - Original idea & design: 侯博涵 (Hou Bohan)
6
+ # https://zhuanlan.zhihu.com/p/2054305616391304228
7
+ # - Perfetto export logic adapted from Apache TVM TIRx `export_to_perfetto_trace`
8
+ # (Apache-2.0; itself adapted from flashinfer):
9
+ # https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
10
+ # https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
11
+ # - Implementation written by Claude Opus (Anthropic).
12
+ """Trace-file writers for warpscope.
13
+
14
+ Default output is Chrome Trace Event JSON, which opens in both chrome://tracing and
15
+ https://ui.perfetto.dev with zero extra dependencies. An optional native
16
+ .perfetto-trace backend is available via tg4perfetto if installed.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+
23
+
24
def write_chrome_trace(result, path: str) -> None:
    """Write a Chrome Trace Event JSON file. pid = block, tid = group; ts/dur in us.

    Emits, in order: one complete ("X") event per span, one thread-scoped
    instant ("i") event per instant, then process/thread name metadata ("M")
    for every block and every group of every block.
    """
    label = result.event_name

    # Nanosecond values are divided by 1000 because the format expects microseconds.
    events = [
        {
            "name": label(span.event_id),
            "ph": "X",
            "ts": span.begin_ns / 1000.0,
            "dur": span.dur_ns / 1000.0,
            "pid": span.block,
            "tid": span.group,
        }
        for span in result.spans
    ]
    events.extend(
        {
            "name": label(instant.event_id),
            "ph": "i",
            "ts": instant.ts_ns / 1000.0,
            "pid": instant.block,
            "tid": instant.group,
            "s": "t",
        }
        for instant in result.instants
    )

    for block in range(result.num_blocks):
        events.append(
            {
                "name": "process_name",
                "ph": "M",
                "pid": block,
                "args": {"name": f"block_{block}"},
            }
        )
        events.extend(
            {
                "name": "thread_name",
                "ph": "M",
                "pid": block,
                "tid": group,
                "args": {"name": result.group_name(group)},
            }
            for group in range(result.num_groups)
        )

    document = {"displayTimeUnit": "ns", "traceEvents": events}
    with open(path, "w", encoding="utf-8") as f:
        json.dump(document, f)
57
+
58
+
59
def write_perfetto_native(result, path: str) -> None:
    """Write a native .perfetto-trace via tg4perfetto (optional dependency).

    One top-level group is created per block with one child group per lane,
    and each distinct (block, group, event) gets its own track. Only spans
    are written; instants in ``result`` are not exported by this writer.

    Raises:
        ImportError: if tg4perfetto is not installed.
    """
    try:
        from tg4perfetto import TraceGenerator
    except ImportError as err:  # pragma: no cover
        raise ImportError(
            "tg4perfetto is required for native perfetto traces; "
            "use to_chrome_trace() for a zero-dependency JSON instead, or "
            "pip install git+https://github.com/ihavnoid/tg4perfetto.git"
        ) from err

    generator = TraceGenerator(path)

    # Pre-create every lane so empty lanes still appear in the trace.
    lane_groups = {}
    for block in range(result.num_blocks):
        block_group = generator.create_group(f"block_{block}")
        for group in range(result.num_groups):
            lane_groups[(block, group)] = block_group.create_group(result.group_name(group))

    # Tracks are created lazily, the first time an event is seen on a lane.
    event_tracks = {}
    for span in result.spans:
        track_key = (span.block, span.group, span.event_id)
        if track_key not in event_tracks:
            lane = lane_groups[(span.block, span.group)]
            event_tracks[track_key] = lane.create_track()
        track = event_tracks[track_key]
        track.open(span.begin_ns, result.event_name(span.event_id))
        track.close(span.begin_ns + span.dur_ns)

    generator.flush()
warpscope/_wire.py ADDED
@@ -0,0 +1,57 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2026 warpscope contributors
3
+ #
4
+ # Credits:
5
+ # - Original idea & design: 侯博涵 (Hou Bohan)
6
+ # https://zhuanlan.zhihu.com/p/2054305616391304228
7
+ # - Wire format adapted from Apache TVM TIRx CudaProfiler (Apache-2.0):
8
+ # https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
9
+ # https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
10
+ # - Implementation written by Claude Opus (Anthropic).
11
+ """warpscope wire format (v1) — the shared ABI between device and host.
12
+
13
+ record = (globaltimer_lo32 << 32) | tag32
14
+ tag32 = (block_group << 12) | (event_id << 2) | event_type
15
+ block_group = block_idx * num_groups + group_id
16
+ """
17
+
18
WIRE_VERSION = 1

# Record kinds stored in the low 2 bits of a tag.
EVENT_TYPE_BEGIN = 0
EVENT_TYPE_END = 1
EVENT_TYPE_INSTANT = 2
EVENT_TYPE_FINALIZE = 3

# Tag layout, low to high: 2-bit event type, 10-bit event id, 20-bit lane index.
_EVENT_TYPE_BITS = 2
_EVENT_ID_BITS = 10
_EVENT_ID_MASK = (1 << _EVENT_ID_BITS) - 1  # 0x3FF
_EVENT_TYPE_MASK = 0x3
_LANE_SHIFT = _EVENT_ID_BITS + _EVENT_TYPE_BITS  # 12
_TAG_BITS = 32


def pack_tag(block: int, group: int, num_groups: int, event_id: int, event_type: int) -> int:
    """Pack a 32-bit tag (the low half of a record).

    Args:
        block: Block index, ``>= 0``.
        group: Group index within the block, ``0 <= group < num_groups``.
        num_groups: Number of groups per block, ``> 0``.
        event_id: Event id, ``0 .. 1023``.
        event_type: One of the ``EVENT_TYPE_*`` values (``0 .. 3``).

    Returns:
        ``((block * num_groups + group) << 12) | (event_id << 2) | event_type``.

    Raises:
        ValueError: if any argument is out of range or the lane index does
            not fit in the tag's upper 20 bits.
    """
    if not (0 <= event_id <= _EVENT_ID_MASK):
        raise ValueError(f"event_id {event_id} exceeds {_EVENT_ID_MASK}")
    if not (0 <= event_type <= _EVENT_TYPE_MASK):
        raise ValueError(f"event_type {event_type} out of range")
    if num_groups <= 0:
        raise ValueError(f"num_groups {num_groups} must be positive")
    if block < 0:
        raise ValueError(f"block {block} must be non-negative")
    # An out-of-range group would silently alias a lane of another block.
    if not (0 <= group < num_groups):
        raise ValueError(f"group {group} out of range for num_groups={num_groups}")
    block_group = block * num_groups + group
    tag = (block_group << _LANE_SHIFT) | (event_id << _EVENT_TYPE_BITS) | event_type
    if tag >> _TAG_BITS:
        raise ValueError("tag overflows 32 bits (too many lanes); reduce num_blocks*num_groups")
    return tag


def decode_tag(tag: int, num_groups: int):
    """Return (block, group, event_id, event_type) from a 32-bit tag.

    ``num_groups`` must be the value used when packing; it splits the lane
    index back into block and group. A zero ``num_groups`` raises
    ``ZeroDivisionError``.
    """
    block_group = tag >> _LANE_SHIFT
    event_id = (tag >> _EVENT_TYPE_BITS) & _EVENT_ID_MASK
    event_type = tag & _EVENT_TYPE_MASK
    block, group = divmod(block_group, num_groups)
    return block, group, event_id, event_type


def pack_record(timestamp_ns: int, tag: int) -> int:
    """Build a 64-bit record: low 32 bits of the timestamp above the 32-bit tag."""
    return ((timestamp_ns & 0xFFFFFFFF) << 32) | (tag & 0xFFFFFFFF)


def pack_header(num_groups: int, num_blocks: int) -> int:
    """Build the slot-0 header: ``num_groups`` in the high word, ``num_blocks`` in the low."""
    return ((num_groups & 0xFFFFFFFF) << 32) | (num_blocks & 0xFFFFFFFF)
warpscope/buffer.py ADDED
@@ -0,0 +1,79 @@
1
+ """Host-side profiler buffer management."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, Optional
6
+
7
+ import numpy as np
8
+
9
+ from ._decode import TraceResult, decode
10
+
11
+
12
class Profiler:
    """Allocates and decodes a warpscope profiler buffer.

    The buffer holds one header slot followed by
    ``num_blocks * num_groups * max_records_per_lane`` record slots, all
    zero-initialised.

    Parameters
    ----------
    num_blocks : int
        Number of CTAs that will record (persistent grid: one block per SM).
    num_groups : int
        Logical sub-tracks per block (e.g. one per warp-group / role).
    max_records_per_lane : int
        Per-(block, group) record cap. Must be passed identically to the device
        ``init(..., max_records_per_lane=...)`` so host buffer size and device cap agree.
    device : str
        "cuda"/"cuda:N" allocates a torch tensor on GPU (requires torch). "cpu"
        allocates a numpy array (for testing/decoding pre-captured buffers).

    Raises
    ------
    ValueError
        If any size is negative, or ``num_blocks * num_groups`` exceeds the
        number of lanes a tag can address.
    ImportError
        If a non-"cpu" device is requested and torch is not installed.
    """

    # A tag keeps 32 - 12 = 20 bits for the lane index (block * num_groups + group).
    _MAX_LANES = 1 << 20

    def __init__(self, num_blocks: int, num_groups: int,
                 max_records_per_lane: int = 64, device: str = "cuda"):
        self.num_blocks = int(num_blocks)
        self.num_groups = int(num_groups)
        self.max_records_per_lane = int(max_records_per_lane)

        # Validate before allocating: two negative sizes can multiply into a
        # plausible positive slot count.
        for label, value in (
            ("num_blocks", self.num_blocks),
            ("num_groups", self.num_groups),
            ("max_records_per_lane", self.max_records_per_lane),
        ):
            if value < 0:
                raise ValueError(f"{label} must be non-negative, got {value}")

        self.write_stride = self.num_blocks * self.num_groups
        if self.write_stride > self._MAX_LANES:
            raise ValueError(
                f"num_blocks * num_groups = {self.write_stride} exceeds "
                f"{self._MAX_LANES} lanes; the 32-bit tag cannot encode more"
            )
        # Slot 0 is the header; the rest are record slots.
        self.num_slots = 1 + self.write_stride * self.max_records_per_lane
        self.device = device

        self._torch = None
        self.tensor = None
        self.array = None
        if device != "cpu":
            try:
                import torch
                self._torch = torch
            except ImportError as err:
                raise ImportError(
                    f"device={device!r} requires torch; install torch or use device='cpu'"
                ) from err
            # Allocated as int64; numpy() reinterprets the bits as uint64.
            self.tensor = torch.zeros(self.num_slots, dtype=torch.int64, device=device)
        else:
            self.array = np.zeros(self.num_slots, dtype=np.uint64)

    @property
    def ptr(self) -> int:
        """Device/host pointer to pass to the kernel as the profiler buffer arg."""
        if self.tensor is not None:
            return self.tensor.data_ptr()
        return self.array.ctypes.data

    def reset(self) -> None:
        """Zero the whole buffer in place, header included."""
        if self.tensor is not None:
            self.tensor.zero_()
        else:
            self.array.fill(0)

    def numpy(self) -> np.ndarray:
        """Return the buffer as a uint64 numpy array (a host copy for torch buffers)."""
        if self.tensor is not None:
            return self.tensor.detach().cpu().numpy().view(np.uint64)
        return self.array

    def decode(self,
               event_names: Optional[Dict[int, str]] = None,
               group_names: Optional[Dict[int, str]] = None) -> TraceResult:
        """Decode the current buffer contents into a ``TraceResult``."""
        return decode(self.numpy(), event_names=event_names, group_names=group_names)

    def __repr__(self) -> str:
        return (f"Profiler(num_blocks={self.num_blocks}, num_groups={self.num_groups}, "
                f"max_records_per_lane={self.max_records_per_lane}, "
                f"write_stride={self.write_stride}, num_slots={self.num_slots})")
warpscope/cli.py ADDED
@@ -0,0 +1,43 @@
1
+ """warpscope command-line helpers (build-system integration)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import os
7
+ import sys
8
+
9
+
10
def include_dir() -> str:
    """Absolute path to the bundled CUDA/C++ headers (for `nvcc -I`)."""
    # The headers live in an "include" directory next to this module.
    package_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(package_dir, "include")
13
+
14
+
15
def main(argv=None) -> int:
    """Entry point for the ``warpscope`` command; always returns exit status 0.

    Handles the first flag set, checked in the order ``--include``,
    ``--cuh``, ``--version``; with no flag it prints the help text.
    ``argv`` defaults to ``sys.argv[1:]`` (argparse behaviour).
    """
    parser = argparse.ArgumentParser(
        prog="warpscope",
        description="In-kernel %globaltimer profiler: header path and version helpers.",
    )
    parser.add_argument(
        "--include",
        action="store_true",
        help="Print the include directory containing warpscope.cuh",
    )
    parser.add_argument(
        "--cuh",
        action="store_true",
        help="Print the full path to warpscope.cuh",
    )
    parser.add_argument(
        "--version",
        action="store_true",
        help="Print the package version",
    )
    options = parser.parse_args(argv)

    if options.include:
        print(include_dir())
    elif options.cuh:
        print(os.path.join(include_dir(), "warpscope.cuh"))
    elif options.version:
        # Imported here so the package is only touched when the version is requested.
        from . import __version__
        print(__version__)
    else:
        parser.print_help()
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,115 @@
1
+ // warpscope - in-kernel %globaltimer profiler (device header)
2
+ //
3
+ // Single-file, dependency-free, NVRTC-safe. Include this in any CUDA / CUTLASS /
4
+ // NVRTC / JIT kernel, bracket logical stages with start()/end(), and pass a zeroed
5
+ // uint64 buffer as an ordinary kernel argument. Decode on the host with
6
+ // warpscope_host.hpp (C++) or the `warpscope` Python package.
7
+ //
8
+ // Wire format (v1, shared ABI):
9
+ // record = (globaltimer_lo32 << 32) | tag32
10
+ // tag32 = (block_group << 12) | (event_id << 2) | event_type
11
+ // block_group = blockIdx.x * num_groups + group_id
12
+ // event_type : 0=begin, 1=end, 2=instant, 3=finalize
13
+ // buf[0] header = (uint64(num_groups) << 32) | num_blocks (written by block0/thread0)
14
+ //
15
+ // SPDX-License-Identifier: MIT
16
+ // Copyright (c) 2026 warpscope contributors
17
+ //
18
+ // Credits:
19
+ // - Original idea & in-kernel CUDA profiler design: 侯博涵 (Hou Bohan)
20
+ // https://zhuanlan.zhihu.com/p/2054305616391304228
21
+ // - Wire format adapted from Apache TVM TIRx `CudaProfiler` (Apache-2.0):
22
+ // https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
23
+ // https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
24
+ // - Implementation written by Claude Opus (Anthropic).
25
+ #pragma once
26
+ #include <cstdint>
27
+
28
+ #define WARPSCOPE_WIRE_VERSION 1
29
+
30
+ namespace ws {
31
+
32
+ // Low 32 bits of the global nanosecond timer. Wraps every ~4.29 s.
33
+ __device__ __forceinline__ uint32_t globaltimer_lo() {
34
+ uint32_t t;
35
+ asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(t));
36
+ return t;
37
+ }
38
+
39
+ // Strongly-typed event id wrapper so call sites read clearly.
40
+ struct Ev { uint32_t id; };
41
+
42
+ enum EventType : uint32_t {
43
+ kBegin = 0,
44
+ kEnd = 1,
45
+ kInstant = 2,
46
+ kFinalize = 3,
47
+ };
48
+
49
+ // One profiler instance per (block, group) recording thread.
50
+ // kEnable == false compiles every method to nothing (zero-overhead off path).
51
+ template <bool kEnable = true>
52
+ struct Profiler {
53
+ uint64_t* buf = nullptr;
54
+ uint32_t stride = 0; // write_stride = num_blocks * num_groups
55
+ uint32_t cursor = 0; // this lane's next slot
56
+ uint32_t tag_base = 0; // block_group << 12
57
+ uint32_t cap = 0xffffffffu; // last writable slot (record cap)
58
+ bool leader = false; // exactly one writer per (block, group)
59
+
60
+ // Call once, before any start(). `is_leader` must be true for exactly one
61
+ // thread of this (block, group); only that thread writes records.
62
+ __device__ void init(uint64_t* p, uint32_t write_stride, uint32_t group_id,
63
+ uint32_t num_groups, uint32_t num_blocks, bool is_leader,
64
+ uint32_t max_records_per_lane = 0xffffffffu) {
65
+ if constexpr (!kEnable) return;
66
+ buf = p;
67
+ stride = write_stride;
68
+ leader = is_leader;
69
+ const uint32_t bg = blockIdx.x * num_groups + group_id;
70
+ tag_base = bg << 12;
71
+ cursor = 1u + bg;
72
+ cap = (max_records_per_lane == 0xffffffffu)
73
+ ? 0xffffffffu
74
+ : (1u + bg + (max_records_per_lane - 1u) * write_stride);
75
+ if (blockIdx.x == 0 && threadIdx.x == 0)
76
+ buf[0] = (static_cast<uint64_t>(num_groups) << 32) | num_blocks;
77
+ }
78
+
79
+ __device__ __forceinline__ void rec(uint32_t ev, uint32_t type) {
80
+ if constexpr (!kEnable) return;
81
+ // Order this thread's prior memory traffic before stamping the timer.
82
+ __threadfence_block();
83
+ if (leader && buf != nullptr && cursor <= cap) {
84
+ buf[cursor] = (static_cast<uint64_t>(globaltimer_lo()) << 32)
85
+ | (tag_base | (ev << 2) | type);
86
+ cursor += stride;
87
+ }
88
+ }
89
+
90
+ __device__ __forceinline__ void start(Ev e) { rec(e.id, kBegin); }
91
+ __device__ __forceinline__ void end (Ev e) { rec(e.id, kEnd); }
92
+ __device__ __forceinline__ void inst (Ev e) { rec(e.id, kInstant); }
93
+ __device__ __forceinline__ void finalize() { rec(0u, kFinalize); }
94
+ };
95
+
96
+ // RAII region guard: start() on construction, end() on scope exit.
97
+ template <bool kEnable>
98
+ struct ScopedRegion {
99
+ Profiler<kEnable>& p;
100
+ Ev e;
101
+ __device__ ScopedRegion(Profiler<kEnable>& p_, Ev e_) : p(p_), e(e_) { p.start(e); }
102
+ __device__ ~ScopedRegion() { p.end(e); }
103
+ };
104
+
105
+ // CTAD guide so `ws::ScopedRegion r{prof, Ev{...}}` deduces kEnable from the profiler.
106
+ template <bool kEnable>
107
+ ScopedRegion(Profiler<kEnable>&, Ev) -> ScopedRegion<kEnable>;
108
+
109
+ } // namespace ws
110
+
111
+ // Bracket a C++ scope: `{ WS_REGION(p, EvCompute); ...work... }`
112
+ #define WS_CONCAT_(a, b) a##b
113
+ #define WS_CONCAT(a, b) WS_CONCAT_(a, b)
114
+ #define WS_REGION(prof, ev) \
115
+ ::ws::ScopedRegion WS_CONCAT(_ws_region_, __LINE__){(prof), ::ws::Ev{(ev)}}
@@ -0,0 +1,168 @@
1
+ // warpscope - host-side decoder + trace writer (header-only, pure C++)
2
+ //
3
+ // No CUDA, no protobuf, no external deps. Include in any host translation unit to
4
+ // turn a warpscope profiler buffer (read back from device) into:
5
+ // - per-(block, group) stage durations (ws::decode)
6
+ // - a Chrome Trace Event JSON file openable in chrome://tracing AND
7
+ // https://ui.perfetto.dev (ws::write_chrome_trace)
8
+ //
9
+ // SPDX-License-Identifier: MIT
10
+ // Copyright (c) 2026 warpscope contributors
11
+ //
12
+ // Credits:
13
+ // - Original idea & in-kernel CUDA profiler design: 侯博涵 (Hou Bohan)
14
+ // https://zhuanlan.zhihu.com/p/2054305616391304228
15
+ // - Wire format and host decode/Perfetto export adapted from Apache TVM TIRx
16
+ // `CudaProfiler` / `export_to_perfetto_trace` (Apache-2.0):
17
+ // https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
18
+ // https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
19
+ // - Implementation written by Claude Opus (Anthropic).
20
+ #pragma once
21
+ #include <cstdint>
22
+ #include <cstdio>
23
+ #include <map>
24
+ #include <string>
25
+ #include <tuple>
26
+ #include <vector>
27
+
28
namespace ws {

// A matched begin/end pair from one (block, group) lane.
// Times are the 32-bit timer values stored in the records (ns modulo 2^32).
struct Span {
  uint32_t block;
  uint32_t group;
  uint32_t event_id;
  uint32_t begin_ns;
  uint32_t dur_ns;
};

// A single point-in-time marker from one (block, group) lane.
struct Instant {
  uint32_t block;
  uint32_t group;
  uint32_t event_id;
  uint32_t ts_ns;
};

// Decoded buffer: header values plus all spans/instants in buffer order.
struct Trace {
  uint32_t num_blocks = 0;
  uint32_t num_groups = 0;
  std::vector<Span> spans;
  std::vector<Instant> instants;
};

// Decode a buffer of `n` uint64 records (including the slot-0 header).
//
// Returns an empty Trace for a null/empty buffer, and a Trace with no events
// when the header reports zero groups. Zero slots are skipped, an end record
// without a pending begin is dropped, and records a lane wrote after its
// finalize record are ignored.
inline Trace decode(const uint64_t* buf, size_t n) {
  Trace tr;
  if (buf == nullptr || n == 0) return tr;
  tr.num_groups = static_cast<uint32_t>(buf[0] >> 32);
  tr.num_blocks = static_cast<uint32_t>(buf[0] & 0xffffffffu);
  if (tr.num_groups == 0) return tr;

  // (block, group, event) -> timestamp of the begin record still waiting for its end.
  std::map<std::tuple<uint32_t, uint32_t, uint32_t>, uint32_t> opens;
  // Lanes that have emitted a finalize record.
  std::map<std::pair<uint32_t, uint32_t>, bool> finished;

  for (size_t i = 1; i < n; ++i) {
    const uint64_t w = buf[i];
    if (w == 0) continue;  // untouched slot
    const uint32_t ts    = static_cast<uint32_t>(w >> 32);
    const uint32_t tag   = static_cast<uint32_t>(w & 0xffffffffu);
    const uint32_t bg    = tag >> 12;
    const uint32_t ev    = (tag >> 2) & 0x3ffu;
    const uint32_t type  = tag & 0x3u;
    const uint32_t block = bg / tr.num_groups;
    const uint32_t group = bg % tr.num_groups;

    if (type == 3u) {  // finalize
      finished[{block, group}] = true;
      continue;
    }
    if (finished.count({block, group})) continue;

    if (type == 0u) {  // begin
      opens[{block, group, ev}] = ts;
    } else if (type == 1u) {  // end
      auto it = opens.find({block, group, ev});
      if (it != opens.end()) {
        // Unsigned subtraction keeps the duration correct across a timer wrap.
        tr.spans.push_back({block, group, ev, it->second,
                            (ts - it->second) & 0xffffffffu});
        opens.erase(it);
      }
    } else if (type == 2u) {  // instant
      tr.instants.push_back({block, group, ev, ts});
    }
  }
  return tr;
}

inline Trace decode(const std::vector<uint64_t>& buf) {
  return decode(buf.data(), buf.size());
}

namespace detail {
// Legacy lookup kept for existing callers: unnamed events all map to "event".
// The writers below use event_label() instead.
inline const char* ev_name(const std::vector<std::string>& names, uint32_t id) {
  return id < names.size() ? names[id].c_str() : "event";
}
inline std::string grp_name(const std::vector<std::string>& names, uint32_t id) {
  return id < names.size() ? names[id] : ("group_" + std::to_string(id));
}
// Event label with an id-carrying fallback ("event_<id>"), mirroring grp_name,
// so unnamed events stay distinguishable in the output.
inline std::string event_label(const std::vector<std::string>& names, uint32_t id) {
  return id < names.size() ? names[id] : ("event_" + std::to_string(id));
}
// Escape a string for use inside a JSON string literal: quote, backslash and
// control characters below 0x20. Other bytes are copied through unchanged.
inline std::string json_escape(const std::string& in) {
  std::string out;
  out.reserve(in.size());
  for (const char c : in) {
    switch (c) {
      case '"':  out += "\\\""; break;
      case '\\': out += "\\\\"; break;
      case '\n': out += "\\n";  break;
      case '\r': out += "\\r";  break;
      case '\t': out += "\\t";  break;
      default:
        if (static_cast<unsigned char>(c) < 0x20) {
          char esc[8];
          std::snprintf(esc, sizeof(esc), "\\u%04x",
                        static_cast<unsigned>(static_cast<unsigned char>(c)));
          out += esc;
        } else {
          out += c;
        }
    }
  }
  return out;
}
}  // namespace detail

// Write a Chrome Trace Event JSON file. pid = block, tid = group; ts/dur in us.
//
// `event_names` / `group_names` are indexed by id; ids beyond the vector get
// "event_<id>" / "group_<id>". Names are JSON-escaped. If the file cannot be
// opened the function returns without writing anything.
inline void write_chrome_trace(const Trace& tr,
                               const std::vector<std::string>& event_names,
                               const std::vector<std::string>& group_names,
                               const std::string& path) {
  FILE* f = std::fopen(path.c_str(), "w");
  if (f == nullptr) return;
  std::fprintf(f, "{\"displayTimeUnit\":\"ns\",\"traceEvents\":[\n");
  // Yields "" for the first element and ",\n" for every later one.
  bool first = true;
  auto sep = [&]() { const char* s = first ? "" : ",\n"; first = false; return s; };

  for (const Span& s : tr.spans) {
    const std::string name = detail::json_escape(detail::event_label(event_names, s.event_id));
    std::fprintf(f,
        "%s{\"name\":\"%s\",\"ph\":\"X\",\"ts\":%.3f,\"dur\":%.3f,\"pid\":%u,\"tid\":%u}",
        sep(), name.c_str(),
        s.begin_ns / 1000.0, s.dur_ns / 1000.0, s.block, s.group);
  }
  for (const Instant& in : tr.instants) {
    const std::string name = detail::json_escape(detail::event_label(event_names, in.event_id));
    std::fprintf(f,
        "%s{\"name\":\"%s\",\"ph\":\"i\",\"ts\":%.3f,\"pid\":%u,\"tid\":%u,\"s\":\"t\"}",
        sep(), name.c_str(),
        in.ts_ns / 1000.0, in.block, in.group);
  }
  // Readable lane names (process per block, thread per group).
  for (uint32_t b = 0; b < tr.num_blocks; ++b) {
    std::fprintf(f,
        "%s{\"name\":\"process_name\",\"ph\":\"M\",\"pid\":%u,\"args\":{\"name\":\"block_%u\"}}",
        sep(), b, b);
    for (uint32_t g = 0; g < tr.num_groups; ++g) {
      const std::string name = detail::json_escape(detail::grp_name(group_names, g));
      std::fprintf(f,
          "%s{\"name\":\"thread_name\",\"ph\":\"M\",\"pid\":%u,\"tid\":%u,\"args\":{\"name\":\"%s\"}}",
          sep(), b, g, name.c_str());
    }
  }
  std::fprintf(f, "\n]}\n");
  std::fclose(f);
}

// Convenience: decode + write in one call.
inline void write_chrome_trace(const uint64_t* buf, size_t n,
                               const std::vector<std::string>& event_names,
                               const std::vector<std::string>& group_names,
                               const std::string& path) {
  write_chrome_trace(decode(buf, n), event_names, group_names, path);
}

// Print per-(block, group) stage durations to stdout, one line per lane,
// lanes ordered by (block, group).
inline void print_durations(const Trace& tr,
                            const std::vector<std::string>& event_names) {
  std::map<std::pair<uint32_t, uint32_t>, std::vector<const Span*>> by_lane;
  for (const Span& s : tr.spans) by_lane[{s.block, s.group}].push_back(&s);
  for (auto& kv : by_lane) {
    std::printf("block %u group %u:", kv.first.first, kv.first.second);
    for (const Span* s : kv.second)
      std::printf(" %s=%uns",
                  detail::event_label(event_names, s->event_id).c_str(), s->dur_ns);
    std::printf("\n");
  }
}

}  // namespace ws
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: warpscope
3
+ Version: 0.1.0
4
+ Summary: In-kernel %globaltimer profiler for warp-specialized CUDA kernels (Perfetto/Chrome timelines).
5
+ Author: warpscope contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/YangWang92/warpscope
8
+ Keywords: cuda,profiler,globaltimer,perfetto,gpu,warp-specialized,tracing
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Topic :: Software Development :: Libraries
12
+ Classifier: Environment :: GPU :: NVIDIA CUDA
13
+ Requires-Python: >=3.9
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: numpy>=1.21
17
+ Provides-Extra: torch
18
+ Requires-Dist: torch; extra == "torch"
19
+ Provides-Extra: perfetto
20
+ Requires-Dist: tg4perfetto; extra == "perfetto"
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest; extra == "dev"
23
+ Dynamic: license-file
24
+
25
+ # warpscope
26
+
27
+ English | [中文](README.zh.md)
28
+
29
+ > Credits: the idea and design all come from **侯博涵 (Hou Bohan)**'s write-up
30
+ > ([zhihu](https://zhuanlan.zhihu.com/p/2054305616391304228)); the wire format and the
31
+ > host-side decode / Perfetto export are adapted from **Apache TVM TIRx `CudaProfiler`**
32
+ > ([bench.py](https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py),
33
+ > [docs](https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html)).
34
+ > The implementation in this repo was **written by Claude Opus (Anthropic)**.
35
+
36
+ In-kernel `%globaltimer` profiler for **warp-specialized CUDA kernels**. Bracket the
37
+ logical stages inside a kernel (TMA load, MMA, softmax, epilogue, ...) with
38
+ `start`/`end` markers; one leader thread per logical group stamps the GPU global timer
39
+ into a buffer you pass as an ordinary kernel argument. Decode it on the host into
40
+ per-`(block, group)` durations or a **Perfetto / Chrome trace** to see how the
41
+ producer and consumer warp-groups actually overlap — something total launch time and
42
+ SM-level counters can't show.
43
+
44
+ It is **not** zero cost (a timer read + a global store + a block fence per event), so
45
+ it is a debugging/analysis tool. Build with the profiler disabled for production.
46
+
47
+ ## Layout
48
+
49
+ ```
50
+ warpscope/
51
+ include/
52
+ warpscope.cuh # device header (header-only, NVRTC-safe)
53
+ warpscope_host.hpp # host decoder + Chrome-trace writer (header-only, pure C++)
54
+ *.py # Python: Profiler buffer mgmt, decode, trace export
55
+ examples/ # toy CUDA program (pure C++ path) + python driver
56
+ tests/ # wire-format + decode tests
57
+ ```
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ pip install -e . # core (numpy only)
63
+ pip install -e ".[torch]" # + GPU buffer allocation
64
+ pip install -e ".[dev]" # + pytest
65
+ ```
66
+
67
+ ## Device side (CUDA C++)
68
+
69
+ ```cpp
70
+ #include <warpscope.cuh>
71
+ enum : uint32_t { EvWait = 0, EvWork = 1 };
72
+
73
+ __global__ void k(..., uint64_t* prof, uint32_t stride,
74
+ uint32_t num_groups, uint32_t num_blocks, uint32_t max_rec) {
75
+ ws::Profiler<true> p; // <false> compiles to a no-op
76
+ const uint32_t warp = threadIdx.x / 32, lane = threadIdx.x % 32;
77
+
78
+ if (warp == 0) { // e.g. TMA producer = group 0
79
+ p.init(prof, stride, /*group=*/0, num_groups, num_blocks,
80
+ /*leader=*/lane == 0, max_rec);
81
+ { WS_REGION(p, EvWait); /* barrier wait */ } // RAII start/end
82
+ { WS_REGION(p, EvWork); /* issue work */ }
83
+ p.finalize();
84
+ }
85
+ // ... other warp-groups: init with their own group id + one leader each ...
86
+ }
87
+ ```
88
+
89
+ Build: `nvcc -I"$(warpscope --include)" -arch=sm_100a my.cu`
90
+
91
+ ## Host side — pick one
92
+
93
+ **Pure C++ (header-only, no Python):**
94
+
95
+ ```cpp
96
+ #include <warpscope_host.hpp>
97
+ std::vector<uint64_t> h(slots); // cudaMemcpy buffer back into h
98
+ ws::write_chrome_trace(h.data(), h.size(),
99
+ /*events*/ {"wait", "work"},
100
+ /*groups*/ {"tma", "umma", "utccp", "epilogue"},
101
+ "trace.json"); // open in chrome://tracing or perfetto
102
+ ```
103
+
104
+ **Python:**
105
+
106
+ ```python
107
+ import warpscope as ws
108
+ prof = ws.Profiler(num_blocks=num_sms, num_groups=4, max_records_per_lane=64)
109
+ launch(..., prof.ptr) # pass the device pointer
110
+ torch.cuda.synchronize()
111
+ res = prof.decode(event_names={0: "wait", 1: "work"},
112
+ group_names={0: "tma", 1: "umma", 2: "utccp", 3: "epilogue"})
113
+ res.print_durations()
114
+ res.to_perfetto("trace.json") # Chrome JSON; opens in ui.perfetto.dev too
115
+ ```
116
+
117
+ ## Output
118
+
119
+ The raw output is a `uint64` buffer. Both host paths turn it into a **Chrome Trace
120
+ Event JSON** file (`pid = block`, `tid = group`, `ts/dur` in microseconds) that opens
121
+ directly in `chrome://tracing` and <https://ui.perfetto.dev>. A native
122
+ `.perfetto-trace` writer is available via the optional `tg4perfetto` dependency.
123
+
124
+ ## Wire format (v1, shared ABI)
125
+
126
+ ```
127
+ record = (globaltimer_lo32 << 32) | tag32
128
+ tag32 = (block_group << 12) | (event_id << 2) | event_type
129
+ block_group = block_idx * num_groups + group_id
130
+ event_type : 0=begin 1=end 2=instant 3=finalize
131
+ buf[0] header = (num_groups << 32) | num_blocks
132
+ ```
133
+
134
+ Identical to the format used by TIRx/flashinfer, so traces are cross-tool compatible.
135
+
136
+ ## Caveats
137
+
138
+ - Zero the buffer before every launch (the decoder treats an all-zero slot as empty); from Python, `Profiler.reset()` re-zeroes it.
139
+ - Exactly one leader thread per `(block, group)` lane (two writers clobber the cursor).
140
+ - `%globaltimer_lo` is 32-bit ns: ~tens-of-ns resolution and a ~4.29 s wrap.
141
+ - Persistent grids stream records — cap them with `max_records_per_lane`. The host uses it to
142
+   size the buffer and the device enforces it via `init(..., max_records_per_lane=...)`, so pass the same value to both.
143
+ - The fence + store perturb tight pipelines; keep events coarse and compare against an
144
+ unprofiled (`ws::Profiler<false>`) build.
145
+
146
+ ## Credits & License
147
+
148
+ Licensed under the **MIT License** (see [LICENSE](LICENSE)).
149
+
150
+ - **侯博涵 (Hou Bohan)** — original idea and write-up:
151
+ <https://zhuanlan.zhihu.com/p/2054305616391304228>
152
+ - **Apache TVM TIRx `CudaProfiler`** (Apache-2.0) — wire format + host decode/Perfetto
153
+ export are adapted from it:
154
+ <https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py> ·
155
+ <https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html>
156
+ - The CUDA/Python implementation in this repository was **written by Claude Opus (Anthropic)**.
@@ -0,0 +1,14 @@
1
+ warpscope/__init__.py,sha256=x48196Y5D0FSj8j6xVmh6IBjaqS98Oae-p69jv6o-KE,1362
2
+ warpscope/_decode.py,sha256=msIdBLi7xQMIDZCkbcLUVXHbnB9hS7hq439uMRmG0ko,4575
3
+ warpscope/_perfetto.py,sha256=RDtoJH55xVF5y6hKANgcrt1XocsMoAe9ve842Q7Fqfk,3117
4
+ warpscope/_wire.py,sha256=QUA9_azwqrjY6gyIAl4ou4TskPBCo7L6Z4SgUFFxy2Y,2163
5
+ warpscope/buffer.py,sha256=Ys_oVaFYjUYnAnxhvhSrTW_-37xm7piVtGM23tFav7s,2922
6
+ warpscope/cli.py,sha256=tWzzQzcB6-4HBUh7s8hKa9rtShCaC6H8KKBOKM4Qu9E,1250
7
+ warpscope/include/warpscope.cuh,sha256=FMbJREQxEYykhSun0WhXeAHMUB16EEeFzrhFhot8ToM,4708
8
+ warpscope/include/warpscope_host.hpp,sha256=_K5OxGNu0CeFSVkO-lUYay1zfnFL0plUOOgaxQxjY1M,6390
9
+ warpscope-0.1.0.dist-info/licenses/LICENSE,sha256=ecqGRSwPBofcM0ZLMKUHmwNCPF50n5pM6ZSlrlHOKB4,1668
10
+ warpscope-0.1.0.dist-info/METADATA,sha256=Pe_tHpCVemUsO8K5vY5hlygdiYTEjkvaLZsib9w457Y,6192
11
+ warpscope-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
12
+ warpscope-0.1.0.dist-info/entry_points.txt,sha256=bCos2hE8zn2ZKbzNx-itvIuwmYB0-F3Hw3P2qZucl0k,49
13
+ warpscope-0.1.0.dist-info/top_level.txt,sha256=PEA9PCUwAmLJPxGVtUg19HBdP7KKnZ4e2eIybQQK1iA,10
14
+ warpscope-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ warpscope = warpscope.cli:main
@@ -0,0 +1,37 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 warpscope contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ ---
24
+
25
+ Credits / Attribution
26
+
27
+ This project's in-kernel CUDA profiling design and wire format originate from:
28
+
29
+ * 侯博涵 (Hou Bohan) — original idea and write-up:
30
+ https://zhuanlan.zhihu.com/p/2054305616391304228
31
+
32
+ * Apache TVM, TIRx `CudaProfiler` (Apache License 2.0) — wire format and the
33
+ host-side decode / Perfetto export logic are adapted from:
34
+ https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
35
+ https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
36
+
37
+ The CUDA/Python implementation in this repository was written by Claude Opus (Anthropic).
@@ -0,0 +1 @@
1
+ warpscope