warpscope 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,37 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 warpscope contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ ---
24
+
25
+ Credits / Attribution
26
+
27
+ This project's in-kernel CUDA profiling design and wire format originate from:
28
+
29
+ * 侯博涵 (Hou Bohan) — original idea and write-up:
30
+ https://zhuanlan.zhihu.com/p/2054305616391304228
31
+
32
+ * Apache TVM, TIRx `CudaProfiler` (Apache License 2.0) — wire format and the
33
+ host-side decode / Perfetto export logic are adapted from:
34
+ https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
35
+ https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
36
+
37
+ The CUDA implementation in this repository was written by Claude Opus (Anthropic).
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: warpscope
3
+ Version: 0.1.0
4
+ Summary: In-kernel %globaltimer profiler for warp-specialized CUDA kernels (Perfetto/Chrome timelines).
5
+ Author: warpscope contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/YangWang92/warpscope
8
+ Keywords: cuda,profiler,globaltimer,perfetto,gpu,warp-specialized,tracing
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Topic :: Software Development :: Libraries
12
+ Classifier: Environment :: GPU :: NVIDIA CUDA
13
+ Requires-Python: >=3.9
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: numpy>=1.21
17
+ Provides-Extra: torch
18
+ Requires-Dist: torch; extra == "torch"
19
+ Provides-Extra: perfetto
20
+ Requires-Dist: tg4perfetto; extra == "perfetto"
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest; extra == "dev"
23
+ Dynamic: license-file
24
+
25
+ # warpscope
26
+
27
+ English | [中文](README.zh.md)
28
+
29
+ > Credits: the idea and design come entirely from **侯博涵 (Hou Bohan)**'s write-up
30
+ > ([zhihu](https://zhuanlan.zhihu.com/p/2054305616391304228)); the wire format and the
31
+ > host-side decode / Perfetto export are adapted from **Apache TVM TIRx `CudaProfiler`**
32
+ > ([bench.py](https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py),
33
+ > [docs](https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html)).
34
+ > The implementation in this repo was **written by Claude Opus (Anthropic)**.
35
+
36
+ In-kernel `%globaltimer` profiler for **warp-specialized CUDA kernels**. Bracket the
37
+ logical stages inside a kernel (TMA load, MMA, softmax, epilogue, ...) with
38
+ `start`/`end` markers; one leader thread per logical group stamps the GPU global timer
39
+ into a buffer you pass as an ordinary kernel argument. Decode it on the host into
40
+ per-`(block, group)` durations or a **Perfetto / Chrome trace** to see how the
41
+ producer and consumer warp-groups actually overlap — something total launch time and
42
+ SM-level counters can't show.
43
+
44
+ It is **not** zero cost (a timer read + a global store + a block fence per event), so
45
+ it is a debugging/analysis tool. Build with the profiler disabled for production.
46
+
47
+ ## Layout
48
+
49
+ ```
50
+ warpscope/
51
+ include/
52
+ warpscope.cuh # device header (header-only, NVRTC-safe)
53
+ warpscope_host.hpp # host decoder + Chrome-trace writer (header-only, pure C++)
54
+ *.py # Python: Profiler buffer mgmt, decode, trace export
55
+ examples/ # toy CUDA program (pure C++ path) + python driver
56
+ tests/ # wire-format + decode tests
57
+ ```
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ pip install -e . # core (numpy only)
63
+ pip install -e ".[torch]" # + GPU buffer allocation
64
+ pip install -e ".[dev]" # + pytest
65
+ ```
66
+
67
+ ## Device side (CUDA C++)
68
+
69
+ ```cpp
70
+ #include <warpscope.cuh>
71
+ enum : uint32_t { EvWait = 0, EvWork = 1 };
72
+
73
+ __global__ void k(..., uint64_t* prof, uint32_t stride,
74
+ uint32_t num_groups, uint32_t num_blocks, uint32_t max_rec) {
75
+ ws::Profiler<true> p; // <false> compiles to a no-op
76
+ const uint32_t warp = threadIdx.x / 32, lane = threadIdx.x % 32;
77
+
78
+ if (warp == 0) { // e.g. TMA producer = group 0
79
+ p.init(prof, stride, /*group=*/0, num_groups, num_blocks,
80
+ /*leader=*/lane == 0, max_rec);
81
+ { WS_REGION(p, EvWait); /* barrier wait */ } // RAII start/end
82
+ { WS_REGION(p, EvWork); /* issue work */ }
83
+ p.finalize();
84
+ }
85
+ // ... other warp-groups: init with their own group id + one leader each ...
86
+ }
87
+ ```
88
+
89
+ Build: `nvcc -I"$(warpscope --include)" -arch=sm_100a my.cu`
90
+
91
+ ## Host side — pick one
92
+
93
+ **Pure C++ (header-only, no Python):**
94
+
95
+ ```cpp
96
+ #include <warpscope_host.hpp>
97
+ std::vector<uint64_t> h(slots); // cudaMemcpy buffer back into h
98
+ ws::write_chrome_trace(h.data(), h.size(),
99
+ /*events*/ {"wait", "work"},
100
+ /*groups*/ {"tma", "umma", "utccp", "epilogue"},
101
+ "trace.json"); // open in chrome://tracing or perfetto
102
+ ```
103
+
104
+ **Python:**
105
+
106
+ ```python
107
+ import warpscope as ws
108
+ prof = ws.Profiler(num_blocks=num_sms, num_groups=4, max_records_per_lane=64)
109
+ launch(..., prof.ptr) # pass the device pointer
110
+ torch.cuda.synchronize()
111
+ res = prof.decode(event_names={0: "wait", 1: "work"},
112
+ group_names={0: "tma", 1: "umma", 2: "utccp", 3: "epilogue"})
113
+ res.print_durations()
114
+ res.to_perfetto("trace.json") # Chrome JSON; opens in ui.perfetto.dev too
115
+ ```
116
+
117
+ ## Output
118
+
119
+ The raw output is a `uint64` buffer. Both host paths turn it into a **Chrome Trace
120
+ Event JSON** file (`pid = block`, `tid = group`, `ts/dur` in microseconds) that opens
121
+ directly in `chrome://tracing` and <https://ui.perfetto.dev>. A native
122
+ `.perfetto-trace` writer is available via the optional `tg4perfetto` dependency (installed by the `perfetto` extra: `pip install -e ".[perfetto]"`).
123
+
124
+ ## Wire format (v1, shared ABI)
125
+
126
+ ```
127
+ record = (globaltimer_lo32 << 32) | tag32
128
+ tag32 = (block_group << 12) | (event_id << 2) | event_type
129
+ block_group = block_idx * num_groups + group_id
130
+ event_type : 0=begin 1=end 2=instant 3=finalize
131
+ buf[0] header = (num_groups << 32) | num_blocks
132
+ ```
133
+
134
+ Identical to the format used by TIRx/flashinfer, so traces are cross-tool compatible.
135
+
136
+ ## Caveats
137
+
138
+ - Zero the buffer before launch (the decoder treats 0 as empty).
139
+ - Exactly one leader thread per `(block, group)` lane (two writers clobber the cursor).
140
+ - `%globaltimer_lo` is a 32-bit nanosecond counter: expect ~tens-of-ns resolution, and it wraps every ~4.29 s.
141
+ - Persistent grids stream records — cap them with `max_records_per_lane` on the host; the
142
+ same cap is also enforced device-side via the `max_records_per_lane` argument of `init(...)` (`max_rec` in the device example above).
143
+ - The fence + store perturb tight pipelines; keep events coarse and compare against an
144
+ unprofiled (`ws::Profiler<false>`) build.
145
+
146
+ ## Credits & License
147
+
148
+ Licensed under the **MIT License** (see [LICENSE](LICENSE)).
149
+
150
+ - **侯博涵 (Hou Bohan)** — original idea and write-up:
151
+ <https://zhuanlan.zhihu.com/p/2054305616391304228>
152
+ - **Apache TVM TIRx `CudaProfiler`** (Apache-2.0) — wire format + host decode/Perfetto
153
+ export are adapted from it:
154
+ <https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py> ·
155
+ <https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html>
156
+ - The CUDA/Python implementation in this repository was **written by Claude Opus (Anthropic)**.
@@ -0,0 +1,132 @@
1
+ # warpscope
2
+
3
+ English | [中文](README.zh.md)
4
+
5
+ > Credits: the idea and design come entirely from **侯博涵 (Hou Bohan)**'s write-up
6
+ > ([zhihu](https://zhuanlan.zhihu.com/p/2054305616391304228)); the wire format and the
7
+ > host-side decode / Perfetto export are adapted from **Apache TVM TIRx `CudaProfiler`**
8
+ > ([bench.py](https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py),
9
+ > [docs](https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html)).
10
+ > The implementation in this repo was **written by Claude Opus (Anthropic)**.
11
+
12
+ In-kernel `%globaltimer` profiler for **warp-specialized CUDA kernels**. Bracket the
13
+ logical stages inside a kernel (TMA load, MMA, softmax, epilogue, ...) with
14
+ `start`/`end` markers; one leader thread per logical group stamps the GPU global timer
15
+ into a buffer you pass as an ordinary kernel argument. Decode it on the host into
16
+ per-`(block, group)` durations or a **Perfetto / Chrome trace** to see how the
17
+ producer and consumer warp-groups actually overlap — something total launch time and
18
+ SM-level counters can't show.
19
+
20
+ It is **not** zero cost (a timer read + a global store + a block fence per event), so
21
+ it is a debugging/analysis tool. Build with the profiler disabled for production.
22
+
23
+ ## Layout
24
+
25
+ ```
26
+ warpscope/
27
+ include/
28
+ warpscope.cuh # device header (header-only, NVRTC-safe)
29
+ warpscope_host.hpp # host decoder + Chrome-trace writer (header-only, pure C++)
30
+ *.py # Python: Profiler buffer mgmt, decode, trace export
31
+ examples/ # toy CUDA program (pure C++ path) + python driver
32
+ tests/ # wire-format + decode tests
33
+ ```
34
+
35
+ ## Install
36
+
37
+ ```bash
38
+ pip install -e . # core (numpy only)
39
+ pip install -e ".[torch]" # + GPU buffer allocation
40
+ pip install -e ".[dev]" # + pytest
41
+ ```
42
+
43
+ ## Device side (CUDA C++)
44
+
45
+ ```cpp
46
+ #include <warpscope.cuh>
47
+ enum : uint32_t { EvWait = 0, EvWork = 1 };
48
+
49
+ __global__ void k(..., uint64_t* prof, uint32_t stride,
50
+ uint32_t num_groups, uint32_t num_blocks, uint32_t max_rec) {
51
+ ws::Profiler<true> p; // <false> compiles to a no-op
52
+ const uint32_t warp = threadIdx.x / 32, lane = threadIdx.x % 32;
53
+
54
+ if (warp == 0) { // e.g. TMA producer = group 0
55
+ p.init(prof, stride, /*group=*/0, num_groups, num_blocks,
56
+ /*leader=*/lane == 0, max_rec);
57
+ { WS_REGION(p, EvWait); /* barrier wait */ } // RAII start/end
58
+ { WS_REGION(p, EvWork); /* issue work */ }
59
+ p.finalize();
60
+ }
61
+ // ... other warp-groups: init with their own group id + one leader each ...
62
+ }
63
+ ```
64
+
65
+ Build: `nvcc -I"$(warpscope --include)" -arch=sm_100a my.cu`
66
+
67
+ ## Host side — pick one
68
+
69
+ **Pure C++ (header-only, no Python):**
70
+
71
+ ```cpp
72
+ #include <warpscope_host.hpp>
73
+ std::vector<uint64_t> h(slots); // cudaMemcpy buffer back into h
74
+ ws::write_chrome_trace(h.data(), h.size(),
75
+ /*events*/ {"wait", "work"},
76
+ /*groups*/ {"tma", "umma", "utccp", "epilogue"},
77
+ "trace.json"); // open in chrome://tracing or perfetto
78
+ ```
79
+
80
+ **Python:**
81
+
82
+ ```python
83
+ import warpscope as ws
84
+ prof = ws.Profiler(num_blocks=num_sms, num_groups=4, max_records_per_lane=64)
85
+ launch(..., prof.ptr) # pass the device pointer
86
+ torch.cuda.synchronize()
87
+ res = prof.decode(event_names={0: "wait", 1: "work"},
88
+ group_names={0: "tma", 1: "umma", 2: "utccp", 3: "epilogue"})
89
+ res.print_durations()
90
+ res.to_perfetto("trace.json") # Chrome JSON; opens in ui.perfetto.dev too
91
+ ```
92
+
93
+ ## Output
94
+
95
+ The raw output is a `uint64` buffer. Both host paths turn it into a **Chrome Trace
96
+ Event JSON** file (`pid = block`, `tid = group`, `ts/dur` in microseconds) that opens
97
+ directly in `chrome://tracing` and <https://ui.perfetto.dev>. A native
98
+ `.perfetto-trace` writer is available via the optional `tg4perfetto` dependency (installed by the `perfetto` extra: `pip install -e ".[perfetto]"`).
99
+
100
+ ## Wire format (v1, shared ABI)
101
+
102
+ ```
103
+ record = (globaltimer_lo32 << 32) | tag32
104
+ tag32 = (block_group << 12) | (event_id << 2) | event_type
105
+ block_group = block_idx * num_groups + group_id
106
+ event_type : 0=begin 1=end 2=instant 3=finalize
107
+ buf[0] header = (num_groups << 32) | num_blocks
108
+ ```
109
+
110
+ Identical to the format used by TIRx/flashinfer, so traces are cross-tool compatible.
111
+
112
+ ## Caveats
113
+
114
+ - Zero the buffer before launch (the decoder treats 0 as empty).
115
+ - Exactly one leader thread per `(block, group)` lane (two writers clobber the cursor).
116
+ - `%globaltimer_lo` is a 32-bit nanosecond counter: expect ~tens-of-ns resolution, and it wraps every ~4.29 s.
117
+ - Persistent grids stream records — cap them with `max_records_per_lane` on the host; the
118
+ same cap is also enforced device-side via the `max_records_per_lane` argument of `init(...)` (`max_rec` in the device example above).
119
+ - The fence + store perturb tight pipelines; keep events coarse and compare against an
120
+ unprofiled (`ws::Profiler<false>`) build.
121
+
122
+ ## Credits & License
123
+
124
+ Licensed under the **MIT License** (see [LICENSE](LICENSE)).
125
+
126
+ - **侯博涵 (Hou Bohan)** — original idea and write-up:
127
+ <https://zhuanlan.zhihu.com/p/2054305616391304228>
128
+ - **Apache TVM TIRx `CudaProfiler`** (Apache-2.0) — wire format + host decode/Perfetto
129
+ export are adapted from it:
130
+ <https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py> ·
131
+ <https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html>
132
+ - The CUDA/Python implementation in this repository was **written by Claude Opus (Anthropic)**.
@@ -0,0 +1,37 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "warpscope"
7
+ version = "0.1.0"
8
+ description = "In-kernel %globaltimer profiler for warp-specialized CUDA kernels (Perfetto/Chrome timelines)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "warpscope contributors" }]
13
+ keywords = ["cuda", "profiler", "globaltimer", "perfetto", "gpu", "warp-specialized", "tracing"]
14
+ classifiers = [
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Topic :: Software Development :: Libraries",
18
+ "Environment :: GPU :: NVIDIA CUDA",
19
+ ]
20
+ dependencies = ["numpy>=1.21"]
21
+
22
+ [project.optional-dependencies]
23
+ torch = ["torch"]
24
+ perfetto = ["tg4perfetto"]
25
+ dev = ["pytest"]
26
+
27
+ [project.scripts]
28
+ warpscope = "warpscope.cli:main"
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/YangWang92/warpscope"
32
+
33
+ [tool.setuptools]
34
+ packages = ["warpscope"]
35
+
36
+ [tool.setuptools.package-data]
37
+ warpscope = ["include/*.cuh", "include/*.hpp"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,84 @@
1
+ """Build a synthetic buffer the way the device would, then check decode + export."""
2
+
3
+ import json
4
+ import os
5
+
6
+ import numpy as np
7
+
8
+ from warpscope import decode
9
+ from warpscope._wire import (
10
+ EVENT_TYPE_BEGIN,
11
+ EVENT_TYPE_END,
12
+ EVENT_TYPE_FINALIZE,
13
+ pack_header,
14
+ pack_record,
15
+ pack_tag,
16
+ )
17
+
18
+
19
def _build_buffer(num_blocks, num_groups, lane_records):
    """Assemble a synthetic profiler buffer for the decoder tests.

    ``lane_records`` maps ``(block, group)`` to that lane's records, each an
    ``(event_id, type, ts)`` tuple, listed in the order the lane's cursor
    would have written them.

    Slot 0 holds the packed header. Lane ``block * num_groups + group`` owns
    slot ``1 + lane`` and every ``num_blocks * num_groups``-th slot after it.
    The buffer is sized for one record more than the longest lane, so at
    least one slot per lane stays zero.
    """
    lane_count = num_blocks * num_groups
    longest = max(map(len, lane_records.values()), default=0)
    out = np.zeros(1 + lane_count * (longest + 1), dtype=np.uint64)
    out[0] = pack_header(num_groups, num_blocks)

    for (block, group), records in lane_records.items():
        first_slot = 1 + block * num_groups + group
        for k, (event_id, event_type, ts) in enumerate(records):
            tag = pack_tag(block, group, num_groups, event_id, event_type)
            # Consecutive records of one lane are lane_count slots apart.
            out[first_slot + k * lane_count] = pack_record(ts, tag)
    return out
33
+
34
+
35
def test_decode_two_groups():
    """One block with two groups decodes to the expected per-(group, event) durations."""
    light_lane = [
        (0, EVENT_TYPE_BEGIN, 100), (0, EVENT_TYPE_END, 196),
        (1, EVENT_TYPE_BEGIN, 200), (1, EVENT_TYPE_END, 3240),
        (0, EVENT_TYPE_FINALIZE, 3300),
    ]
    heavy_lane = [
        (1, EVENT_TYPE_BEGIN, 200), (1, EVENT_TYPE_END, 11016),
        (0, EVENT_TYPE_FINALIZE, 11100),
    ]
    buf = _build_buffer(
        num_blocks=1,
        num_groups=2,
        lane_records={(0, 0): light_lane, (0, 1): heavy_lane},
    )
    res = decode(
        buf,
        event_names={0: "load", 1: "compute"},
        group_names={0: "light", 1: "heavy"},
    )

    assert res.num_blocks == 1 and res.num_groups == 2
    measured = {(span.group, span.event_id): span.dur_ns for span in res.spans}
    assert measured[(0, 0)] == 96      # group 0 load
    assert measured[(0, 1)] == 3040    # group 0 compute
    assert measured[(1, 1)] == 10816   # group 1 compute (heavier)
52
+
53
+
54
def test_finalize_drops_later_records():
    """Records written to a lane after its FINALIZE marker must not yield spans."""
    before_finalize = [(0, EVENT_TYPE_BEGIN, 10), (0, EVENT_TYPE_END, 20)]
    after_finalize = [(0, EVENT_TYPE_BEGIN, 40), (0, EVENT_TYPE_END, 99)]
    records = before_finalize + [(0, EVENT_TYPE_FINALIZE, 30)] + after_finalize

    res = decode(_build_buffer(1, 1, {(0, 0): records}))

    # Only the begin/end pair that precedes the finalize marker survives.
    assert len(res.spans) == 1
    assert res.spans[0].dur_ns == 10
64
+
65
+
66
def test_timestamp_wrap():
    """A span whose end timestamp wrapped past 2**32 still gets a positive duration."""
    begin_ts, end_ts = 0xFFFFFFF0, 0x10
    records = [(0, EVENT_TYPE_BEGIN, begin_ts), (0, EVENT_TYPE_END, end_ts)]
    res = decode(_build_buffer(1, 1, {(0, 0): records}))
    assert res.spans[0].dur_ns == 0x20  # (0x10 - 0xFFFFFFF0) & 0xFFFFFFFF
71
+
72
+
73
def test_chrome_trace_export(tmp_path):
    """Exporting one span writes a JSON trace holding exactly one complete ("X") event."""
    records = [(0, EVENT_TYPE_BEGIN, 1000), (0, EVENT_TYPE_END, 2000)]
    res = decode(
        _build_buffer(1, 1, {(0, 0): records}),
        event_names={0: "work"},
        group_names={0: "main"},
    )

    out = os.path.join(tmp_path, "t.json")
    res.to_perfetto(out)
    with open(out) as f:
        trace = json.load(f)

    complete = list(filter(lambda e: e["ph"] == "X", trace["traceEvents"]))
    assert len(complete) == 1
    event = complete[0]
    assert event["name"] == "work"
    assert event["ts"] == 1.0 and event["dur"] == 1.0  # microseconds
@@ -0,0 +1,47 @@
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from warpscope._wire import (
5
+ EVENT_TYPE_BEGIN,
6
+ EVENT_TYPE_END,
7
+ decode_tag,
8
+ pack_header,
9
+ pack_record,
10
+ pack_tag,
11
+ )
12
+
13
+
14
def test_tag_roundtrip():
    """pack_tag followed by decode_tag returns the original fields (num_groups=4)."""
    num_groups = 4
    cases = (
        (block, group, ev, typ)
        for block in (0, 1, 7, 147)
        for group in range(4)
        for ev in (0, 1, 9, 1023)
        for typ in range(4)
    )
    for block, group, ev, typ in cases:
        tag = pack_tag(block, group, num_groups, ev, typ)
        b, g, e, t = decode_tag(tag, num_groups)
        assert (b, g, e, t) == (block, group, ev, typ)
22
+
23
+
24
def test_record_layout():
    """A record carries the timestamp in its upper 32 bits and the tag in its lower 32."""
    timestamp = 0x1234ABCD
    tag = pack_tag(2, 1, 4, 6, EVENT_TYPE_BEGIN)
    rec = pack_record(timestamp, tag)

    upper = rec >> 32
    lower = rec & 0xFFFFFFFF
    assert upper == 0x1234ABCD
    assert lower == tag
29
+
30
+
31
def test_header():
    """The header word stores num_groups in the upper half and num_blocks in the lower half."""
    header = pack_header(num_groups=4, num_blocks=148)
    groups_field = header >> 32
    blocks_field = header & 0xFFFFFFFF
    assert groups_field == 4
    assert blocks_field == 148
35
+
36
+
37
def test_event_id_overflow():
    """pack_tag raises ValueError for event id 1024 (the round-trip test covers ids up to 1023)."""
    pytest.raises(ValueError, pack_tag, 0, 0, 1, 1024, EVENT_TYPE_END)
40
+
41
+
42
def test_view_as_uint64():
    """Records must survive an int64 <-> uint64 reinterpretation (torch stores as int64)."""
    rec = pack_record(0xFFFFFFFF, pack_tag(1, 0, 2, 3, EVENT_TYPE_END))
    unsigned = np.array([rec], dtype=np.uint64)
    round_tripped = unsigned.view(np.int64).view(np.uint64)
    assert int(round_tripped[0]) == rec
@@ -0,0 +1,48 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2026 warpscope contributors
3
+ #
4
+ # Credits:
5
+ # - Original idea & design: 侯博涵 (Hou Bohan)
6
+ # https://zhuanlan.zhihu.com/p/2054305616391304228
7
+ # - Wire format / decode / Perfetto export adapted from Apache TVM TIRx CudaProfiler:
8
+ # https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
9
+ # https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
10
+ # - Implementation written by Claude Opus (Anthropic).
11
+ """warpscope - in-kernel %globaltimer profiler for warp-specialized CUDA kernels.
12
+
13
+ Bracket logical stages inside a CUDA kernel with the device header
14
+ ``warpscope.cuh``, pass a zeroed uint64 buffer, then decode it here into per-
15
+ (block, group) durations or a Perfetto/Chrome trace file.
16
+ """
17
+
18
+ from ._decode import Instant, Span, TraceResult, decode
19
+ from ._wire import (
20
+ EVENT_TYPE_BEGIN,
21
+ EVENT_TYPE_END,
22
+ EVENT_TYPE_FINALIZE,
23
+ EVENT_TYPE_INSTANT,
24
+ WIRE_VERSION,
25
+ decode_tag,
26
+ pack_tag,
27
+ )
28
+ from .buffer import Profiler
29
+ from .cli import include_dir
30
+
31
+ __version__ = "0.1.0"
32
+
33
+ __all__ = [
34
+ "Profiler",
35
+ "decode",
36
+ "TraceResult",
37
+ "Span",
38
+ "Instant",
39
+ "include_dir",
40
+ "decode_tag",
41
+ "pack_tag",
42
+ "WIRE_VERSION",
43
+ "EVENT_TYPE_BEGIN",
44
+ "EVENT_TYPE_END",
45
+ "EVENT_TYPE_INSTANT",
46
+ "EVENT_TYPE_FINALIZE",
47
+ "__version__",
48
+ ]
@@ -0,0 +1,141 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2026 warpscope contributors
3
+ #
4
+ # Credits:
5
+ # - Original idea & design: 侯博涵 (Hou Bohan)
6
+ # https://zhuanlan.zhihu.com/p/2054305616391304228
7
+ # - Decode logic adapted from Apache TVM TIRx CudaProfiler (Apache-2.0):
8
+ # https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
9
+ # https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
10
+ # - Implementation written by Claude Opus (Anthropic).
11
+ """Decode a warpscope profiler buffer into spans/instants."""
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Dict, List, Optional
17
+
18
+ import numpy as np
19
+
20
+ from ._wire import (
21
+ EVENT_TYPE_BEGIN,
22
+ EVENT_TYPE_END,
23
+ EVENT_TYPE_FINALIZE,
24
+ EVENT_TYPE_INSTANT,
25
+ decode_tag,
26
+ )
27
+
28
+
29
@dataclass
class Span:
    """A matched begin/end pair recorded by one (block, group) lane."""

    # Lane identity and event id, as decoded from the record tag.
    block: int
    group: int
    event_id: int
    # Timestamp field of the BEGIN record (upper 32 bits of the record word).
    begin_ns: int
    # End minus begin, reduced modulo 2**32 so a wrapped timer stays positive.
    dur_ns: int
36
+
37
+
38
@dataclass
class Instant:
    """A single point-in-time marker recorded by one (block, group) lane."""

    # Lane identity and event id, as decoded from the record tag.
    block: int
    group: int
    event_id: int
    # Timestamp field of the record (upper 32 bits of the record word).
    ts_ns: int
44
+
45
+
46
@dataclass
class TraceResult:
    """Decoded contents of one profiler buffer.

    Holds the block/group counts read from the buffer header, the spans and
    instants recovered from the records, and optional id -> name tables used
    when rendering events and groups.
    """

    num_blocks: int
    num_groups: int
    spans: List["Span"] = field(default_factory=list)
    instants: List["Instant"] = field(default_factory=list)
    event_names: Optional[Dict[int, str]] = None
    group_names: Optional[Dict[int, str]] = None

    def event_name(self, event_id: int) -> str:
        """Return the configured name for ``event_id``, or ``event_<id>`` if none is set."""
        names = self.event_names
        if not names or event_id not in names:
            return f"event_{event_id}"
        return names[event_id]

    def group_name(self, group: int) -> str:
        """Return the configured name for ``group``, or ``group_<id>`` if none is set."""
        names = self.group_names
        if not names or group not in names:
            return f"group_{group}"
        return names[group]

    def print_durations(self) -> None:
        """Print one line per (block, group) lane listing its span durations in ns.

        Lanes are printed in sorted (block, group) order; within a lane the
        spans keep the order in which they appear in ``self.spans``.
        """
        by_lane: Dict[tuple, List[Span]] = {}
        for span in self.spans:
            key = (span.block, span.group)
            if key not in by_lane:
                by_lane[key] = []
            by_lane[key].append(span)

        for key in sorted(by_lane):
            block, group = key
            rendered = [f"{self.event_name(s.event_id)}={s.dur_ns}ns" for s in by_lane[key]]
            parts = ", ".join(rendered)
            print(f"block {block} {self.group_name(group)}: {parts}")

    def summary(self) -> Dict[tuple, Dict[str, float]]:
        """Count, mean and max span duration per (group name, event name) over all blocks."""
        buckets: Dict[tuple, List[int]] = {}
        for span in self.spans:
            key = (span.group, span.event_id)
            if key in buckets:
                buckets[key].append(span.dur_ns)
            else:
                buckets[key] = [span.dur_ns]

        def _stats(durations):
            values = np.asarray(durations, dtype=np.float64)
            return {
                "count": int(values.size),
                "mean_ns": float(values.mean()),
                "max_ns": float(values.max()),
            }

        return {
            (self.group_name(group), self.event_name(ev)): _stats(durations)
            for (group, ev), durations in buckets.items()
        }

    def to_perfetto(self, path: str) -> str:
        """Alias for to_chrome_trace (Perfetto UI also reads Chrome JSON)."""
        return self.to_chrome_trace(path)

    def to_chrome_trace(self, path: str) -> str:
        """Write this result to ``path`` through ``_perfetto.write_chrome_trace``; return ``path``."""
        # Imported here rather than at module level, so the exporter is only
        # loaded when a trace is actually written.
        from ._perfetto import write_chrome_trace

        write_chrome_trace(self, path)
        return path
97
+
98
+
99
def decode(
    buf,
    event_names: Optional[Dict[int, str]] = None,
    group_names: Optional[Dict[int, str]] = None,
) -> TraceResult:
    """Decode a buffer (numpy array / torch tensor / sequence of uint64).

    Word 0 is the header: ``num_groups`` in its upper 32 bits and
    ``num_blocks`` in its lower 32 bits. Every later non-zero word is a
    record with the timestamp in the upper 32 bits and the tag in the lower
    32 bits; zero words are skipped as empty slots.

    Records are visited in buffer order. A BEGIN remembers its timestamp per
    ``(block, group, event_id)``; a matching END turns it into a ``Span``
    whose duration is taken modulo 2**32. An END without a pending BEGIN is
    ignored. INSTANT records become ``Instant`` entries. Once a FINALIZE
    record is seen for a ``(block, group)`` lane, all later records of that
    lane are dropped.

    Returns a ``TraceResult``. It is empty (with 0 blocks / 0 groups) when
    the buffer has no words, and has no spans or instants when the header
    reports zero groups.
    """
    if hasattr(buf, "detach"):  # torch tensor
        buf = buf.detach().cpu().numpy()
    words = np.ascontiguousarray(buf).view(np.uint64).ravel()
    if words.size == 0:
        return TraceResult(0, 0, event_names=event_names, group_names=group_names)

    header = int(words[0])
    num_groups, num_blocks = divmod(header, 1 << 32)
    trace = TraceResult(int(num_blocks), int(num_groups),
                        event_names=event_names, group_names=group_names)
    if num_groups == 0:
        return trace

    pending: Dict[tuple, int] = {}   # (block, group, event_id) -> begin timestamp
    finalized = set()                # lanes that already emitted FINALIZE
    for word in words[1:].tolist():
        if word == 0:
            continue
        ts, tag = divmod(word, 1 << 32)
        block, group, ev, typ = decode_tag(tag, num_groups)
        lane = (block, group)

        if typ == EVENT_TYPE_FINALIZE:
            finalized.add(lane)
        elif lane in finalized:
            # Anything a lane wrote after its FINALIZE marker is discarded.
            pass
        elif typ == EVENT_TYPE_BEGIN:
            pending[(block, group, ev)] = ts
        elif typ == EVENT_TYPE_END:
            key = (block, group, ev)
            if key in pending:
                began = pending.pop(key)
                # Mask so a wrapped 32-bit timer still gives a positive duration.
                trace.spans.append(Span(block, group, ev, began, (ts - began) & 0xFFFFFFFF))
        elif typ == EVENT_TYPE_INSTANT:
            trace.instants.append(Instant(block, group, ev, ts))
    return trace