warpscope 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warpscope-0.1.0/LICENSE +37 -0
- warpscope-0.1.0/PKG-INFO +156 -0
- warpscope-0.1.0/README.md +132 -0
- warpscope-0.1.0/pyproject.toml +37 -0
- warpscope-0.1.0/setup.cfg +4 -0
- warpscope-0.1.0/tests/test_decode_golden.py +84 -0
- warpscope-0.1.0/tests/test_wire_roundtrip.py +47 -0
- warpscope-0.1.0/warpscope/__init__.py +48 -0
- warpscope-0.1.0/warpscope/_decode.py +141 -0
- warpscope-0.1.0/warpscope/_perfetto.py +83 -0
- warpscope-0.1.0/warpscope/_wire.py +57 -0
- warpscope-0.1.0/warpscope/buffer.py +79 -0
- warpscope-0.1.0/warpscope/cli.py +43 -0
- warpscope-0.1.0/warpscope/include/warpscope.cuh +115 -0
- warpscope-0.1.0/warpscope/include/warpscope_host.hpp +168 -0
- warpscope-0.1.0/warpscope.egg-info/PKG-INFO +156 -0
- warpscope-0.1.0/warpscope.egg-info/SOURCES.txt +19 -0
- warpscope-0.1.0/warpscope.egg-info/dependency_links.txt +1 -0
- warpscope-0.1.0/warpscope.egg-info/entry_points.txt +2 -0
- warpscope-0.1.0/warpscope.egg-info/requires.txt +10 -0
- warpscope-0.1.0/warpscope.egg-info/top_level.txt +1 -0
warpscope-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 warpscope contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
Credits / Attribution
|
|
26
|
+
|
|
27
|
+
This project's in-kernel CUDA profiling design and wire format originate from:
|
|
28
|
+
|
|
29
|
+
* 侯博涵 (Hou Bohan) — original idea and write-up:
|
|
30
|
+
https://zhuanlan.zhihu.com/p/2054305616391304228
|
|
31
|
+
|
|
32
|
+
* Apache TVM, TIRx `CudaProfiler` (Apache License 2.0) — wire format and the
|
|
33
|
+
host-side decode / Perfetto export logic are adapted from:
|
|
34
|
+
https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
|
|
35
|
+
https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
|
|
36
|
+
|
|
37
|
+
The CUDA implementation in this repository was written by Claude Opus (Anthropic).
|
warpscope-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: warpscope
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: In-kernel %globaltimer profiler for warp-specialized CUDA kernels (Perfetto/Chrome timelines).
|
|
5
|
+
Author: warpscope contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/YangWang92/warpscope
|
|
8
|
+
Keywords: cuda,profiler,globaltimer,perfetto,gpu,warp-specialized,tracing
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
12
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA
|
|
13
|
+
Requires-Python: >=3.9
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: numpy>=1.21
|
|
17
|
+
Provides-Extra: torch
|
|
18
|
+
Requires-Dist: torch; extra == "torch"
|
|
19
|
+
Provides-Extra: perfetto
|
|
20
|
+
Requires-Dist: tg4perfetto; extra == "perfetto"
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest; extra == "dev"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# warpscope
|
|
26
|
+
|
|
27
|
+
English | [中文](README.zh.md)
|
|
28
|
+
|
|
29
|
+
> Credits: the idea and design all come from **侯博涵 (Hou Bohan)**'s write-up
|
|
30
|
+
> ([zhihu](https://zhuanlan.zhihu.com/p/2054305616391304228)); the wire format and the
|
|
31
|
+
> host-side decode / Perfetto export are adapted from **Apache TVM TIRx `CudaProfiler`**
|
|
32
|
+
> ([bench.py](https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py),
|
|
33
|
+
> [docs](https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html)).
|
|
34
|
+
> The implementation in this repo was **written by Claude Opus (Anthropic)**.
|
|
35
|
+
|
|
36
|
+
In-kernel `%globaltimer` profiler for **warp-specialized CUDA kernels**. Bracket the
|
|
37
|
+
logical stages inside a kernel (TMA load, MMA, softmax, epilogue, ...) with
|
|
38
|
+
`start`/`end` markers; one leader thread per logical group stamps the GPU global timer
|
|
39
|
+
into a buffer you pass as an ordinary kernel argument. Decode it on the host into
|
|
40
|
+
per-`(block, group)` durations or a **Perfetto / Chrome trace** to see how the
|
|
41
|
+
producer and consumer warp-groups actually overlap — something total launch time and
|
|
42
|
+
SM-level counters can't show.
|
|
43
|
+
|
|
44
|
+
It is **not** zero cost (a timer read + a global store + a block fence per event), so
|
|
45
|
+
it is a debugging/analysis tool. Build with the profiler disabled for production.
|
|
46
|
+
|
|
47
|
+
## Layout
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
warpscope/
|
|
51
|
+
include/
|
|
52
|
+
warpscope.cuh # device header (header-only, NVRTC-safe)
|
|
53
|
+
warpscope_host.hpp # host decoder + Chrome-trace writer (header-only, pure C++)
|
|
54
|
+
*.py # Python: Profiler buffer mgmt, decode, trace export
|
|
55
|
+
examples/ # toy CUDA program (pure C++ path) + python driver
|
|
56
|
+
tests/ # wire-format + decode tests
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install -e . # core (numpy only)
|
|
63
|
+
pip install -e ".[torch]" # + GPU buffer allocation
|
|
64
|
+
pip install -e ".[dev]" # + pytest
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Device side (CUDA C++)
|
|
68
|
+
|
|
69
|
+
```cpp
|
|
70
|
+
#include <warpscope.cuh>
|
|
71
|
+
enum : uint32_t { EvWait = 0, EvWork = 1 };
|
|
72
|
+
|
|
73
|
+
__global__ void k(..., uint64_t* prof, uint32_t stride,
|
|
74
|
+
uint32_t num_groups, uint32_t num_blocks, uint32_t max_rec) {
|
|
75
|
+
ws::Profiler<true> p; // <false> compiles to a no-op
|
|
76
|
+
const uint32_t warp = threadIdx.x / 32, lane = threadIdx.x % 32;
|
|
77
|
+
|
|
78
|
+
if (warp == 0) { // e.g. TMA producer = group 0
|
|
79
|
+
p.init(prof, stride, /*group=*/0, num_groups, num_blocks,
|
|
80
|
+
/*leader=*/lane == 0, max_rec);
|
|
81
|
+
{ WS_REGION(p, EvWait); /* barrier wait */ } // RAII start/end
|
|
82
|
+
{ WS_REGION(p, EvWork); /* issue work */ }
|
|
83
|
+
p.finalize();
|
|
84
|
+
}
|
|
85
|
+
// ... other warp-groups: init with their own group id + one leader each ...
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Build: `nvcc -I"$(warpscope --include)" -arch=sm_100a my.cu`
|
|
90
|
+
|
|
91
|
+
## Host side — pick one
|
|
92
|
+
|
|
93
|
+
**Pure C++ (header-only, no Python):**
|
|
94
|
+
|
|
95
|
+
```cpp
|
|
96
|
+
#include <warpscope_host.hpp>
|
|
97
|
+
std::vector<uint64_t> h(slots); // cudaMemcpy buffer back into h
|
|
98
|
+
ws::write_chrome_trace(h.data(), h.size(),
|
|
99
|
+
/*events*/ {"wait", "work"},
|
|
100
|
+
/*groups*/ {"tma", "umma", "utccp", "epilogue"},
|
|
101
|
+
"trace.json"); // open in chrome://tracing or perfetto
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Python:**
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
import warpscope as ws
|
|
108
|
+
prof = ws.Profiler(num_blocks=num_sms, num_groups=4, max_records_per_lane=64)
|
|
109
|
+
launch(..., prof.ptr) # pass the device pointer
|
|
110
|
+
torch.cuda.synchronize()
|
|
111
|
+
res = prof.decode(event_names={0: "wait", 1: "work"},
|
|
112
|
+
group_names={0: "tma", 1: "umma", 2: "utccp", 3: "epilogue"})
|
|
113
|
+
res.print_durations()
|
|
114
|
+
res.to_perfetto("trace.json") # Chrome JSON; opens in ui.perfetto.dev too
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Output
|
|
118
|
+
|
|
119
|
+
The raw output is a `uint64` buffer. Both host paths turn it into a **Chrome Trace
|
|
120
|
+
Event JSON** file (`pid = block`, `tid = group`, `ts/dur` in microseconds) that opens
|
|
121
|
+
directly in `chrome://tracing` and <https://ui.perfetto.dev>. A native
|
|
122
|
+
`.perfetto-trace` writer is available via the optional `tg4perfetto` dependency.
|
|
123
|
+
|
|
124
|
+
## Wire format (v1, shared ABI)
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
record = (globaltimer_lo32 << 32) | tag32
|
|
128
|
+
tag32 = (block_group << 12) | (event_id << 2) | event_type
|
|
129
|
+
block_group = block_idx * num_groups + group_id
|
|
130
|
+
event_type : 0=begin 1=end 2=instant 3=finalize
|
|
131
|
+
buf[0] header = (num_groups << 32) | num_blocks
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Identical to the format used by TIRx/flashinfer, so traces are cross-tool compatible.
|
|
135
|
+
|
|
136
|
+
## Caveats
|
|
137
|
+
|
|
138
|
+
- Zero the buffer before launch (the decoder treats 0 as empty).
|
|
139
|
+
- Exactly one leader thread per `(block, group)` lane (two writers clobber the cursor).
|
|
140
|
+
- `%globaltimer_lo` is 32-bit ns: ~tens-of-ns resolution and a ~4.29 s wrap.
|
|
141
|
+
- Persistent grids stream records — cap with `max_records_per_lane` (host) which is
|
|
142
|
+
also enforced device-side via `init(..., max_records_per_lane=...)`.
|
|
143
|
+
- The fence + store perturb tight pipelines; keep events coarse and compare against an
|
|
144
|
+
unprofiled (`ws::Profiler<false>`) build.
|
|
145
|
+
|
|
146
|
+
## Credits & License
|
|
147
|
+
|
|
148
|
+
Licensed under the **MIT License** (see [LICENSE](LICENSE)).
|
|
149
|
+
|
|
150
|
+
- **侯博涵 (Hou Bohan)** — original idea and write-up:
|
|
151
|
+
<https://zhuanlan.zhihu.com/p/2054305616391304228>
|
|
152
|
+
- **Apache TVM TIRx `CudaProfiler`** (Apache-2.0) — wire format + host decode/Perfetto
|
|
153
|
+
export are adapted from it:
|
|
154
|
+
<https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py> ·
|
|
155
|
+
<https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html>
|
|
156
|
+
- The CUDA/Python implementation in this repository was **written by Claude Opus (Anthropic)**.
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# warpscope
|
|
2
|
+
|
|
3
|
+
English | [中文](README.zh.md)
|
|
4
|
+
|
|
5
|
+
> Credits: the idea and design all come from **侯博涵 (Hou Bohan)**'s write-up
|
|
6
|
+
> ([zhihu](https://zhuanlan.zhihu.com/p/2054305616391304228)); the wire format and the
|
|
7
|
+
> host-side decode / Perfetto export are adapted from **Apache TVM TIRx `CudaProfiler`**
|
|
8
|
+
> ([bench.py](https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py),
|
|
9
|
+
> [docs](https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html)).
|
|
10
|
+
> The implementation in this repo was **written by Claude Opus (Anthropic)**.
|
|
11
|
+
|
|
12
|
+
In-kernel `%globaltimer` profiler for **warp-specialized CUDA kernels**. Bracket the
|
|
13
|
+
logical stages inside a kernel (TMA load, MMA, softmax, epilogue, ...) with
|
|
14
|
+
`start`/`end` markers; one leader thread per logical group stamps the GPU global timer
|
|
15
|
+
into a buffer you pass as an ordinary kernel argument. Decode it on the host into
|
|
16
|
+
per-`(block, group)` durations or a **Perfetto / Chrome trace** to see how the
|
|
17
|
+
producer and consumer warp-groups actually overlap — something total launch time and
|
|
18
|
+
SM-level counters can't show.
|
|
19
|
+
|
|
20
|
+
It is **not** zero cost (a timer read + a global store + a block fence per event), so
|
|
21
|
+
it is a debugging/analysis tool. Build with the profiler disabled for production.
|
|
22
|
+
|
|
23
|
+
## Layout
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
warpscope/
|
|
27
|
+
include/
|
|
28
|
+
warpscope.cuh # device header (header-only, NVRTC-safe)
|
|
29
|
+
warpscope_host.hpp # host decoder + Chrome-trace writer (header-only, pure C++)
|
|
30
|
+
*.py # Python: Profiler buffer mgmt, decode, trace export
|
|
31
|
+
examples/ # toy CUDA program (pure C++ path) + python driver
|
|
32
|
+
tests/ # wire-format + decode tests
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install -e . # core (numpy only)
|
|
39
|
+
pip install -e ".[torch]" # + GPU buffer allocation
|
|
40
|
+
pip install -e ".[dev]" # + pytest
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Device side (CUDA C++)
|
|
44
|
+
|
|
45
|
+
```cpp
|
|
46
|
+
#include <warpscope.cuh>
|
|
47
|
+
enum : uint32_t { EvWait = 0, EvWork = 1 };
|
|
48
|
+
|
|
49
|
+
__global__ void k(..., uint64_t* prof, uint32_t stride,
|
|
50
|
+
uint32_t num_groups, uint32_t num_blocks, uint32_t max_rec) {
|
|
51
|
+
ws::Profiler<true> p; // <false> compiles to a no-op
|
|
52
|
+
const uint32_t warp = threadIdx.x / 32, lane = threadIdx.x % 32;
|
|
53
|
+
|
|
54
|
+
if (warp == 0) { // e.g. TMA producer = group 0
|
|
55
|
+
p.init(prof, stride, /*group=*/0, num_groups, num_blocks,
|
|
56
|
+
/*leader=*/lane == 0, max_rec);
|
|
57
|
+
{ WS_REGION(p, EvWait); /* barrier wait */ } // RAII start/end
|
|
58
|
+
{ WS_REGION(p, EvWork); /* issue work */ }
|
|
59
|
+
p.finalize();
|
|
60
|
+
}
|
|
61
|
+
// ... other warp-groups: init with their own group id + one leader each ...
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Build: `nvcc -I"$(warpscope --include)" -arch=sm_100a my.cu`
|
|
66
|
+
|
|
67
|
+
## Host side — pick one
|
|
68
|
+
|
|
69
|
+
**Pure C++ (header-only, no Python):**
|
|
70
|
+
|
|
71
|
+
```cpp
|
|
72
|
+
#include <warpscope_host.hpp>
|
|
73
|
+
std::vector<uint64_t> h(slots); // cudaMemcpy buffer back into h
|
|
74
|
+
ws::write_chrome_trace(h.data(), h.size(),
|
|
75
|
+
/*events*/ {"wait", "work"},
|
|
76
|
+
/*groups*/ {"tma", "umma", "utccp", "epilogue"},
|
|
77
|
+
"trace.json"); // open in chrome://tracing or perfetto
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Python:**
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
import warpscope as ws
|
|
84
|
+
prof = ws.Profiler(num_blocks=num_sms, num_groups=4, max_records_per_lane=64)
|
|
85
|
+
launch(..., prof.ptr) # pass the device pointer
|
|
86
|
+
torch.cuda.synchronize()
|
|
87
|
+
res = prof.decode(event_names={0: "wait", 1: "work"},
|
|
88
|
+
group_names={0: "tma", 1: "umma", 2: "utccp", 3: "epilogue"})
|
|
89
|
+
res.print_durations()
|
|
90
|
+
res.to_perfetto("trace.json") # Chrome JSON; opens in ui.perfetto.dev too
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Output
|
|
94
|
+
|
|
95
|
+
The raw output is a `uint64` buffer. Both host paths turn it into a **Chrome Trace
|
|
96
|
+
Event JSON** file (`pid = block`, `tid = group`, `ts/dur` in microseconds) that opens
|
|
97
|
+
directly in `chrome://tracing` and <https://ui.perfetto.dev>. A native
|
|
98
|
+
`.perfetto-trace` writer is available via the optional `tg4perfetto` dependency.
|
|
99
|
+
|
|
100
|
+
## Wire format (v1, shared ABI)
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
record = (globaltimer_lo32 << 32) | tag32
|
|
104
|
+
tag32 = (block_group << 12) | (event_id << 2) | event_type
|
|
105
|
+
block_group = block_idx * num_groups + group_id
|
|
106
|
+
event_type : 0=begin 1=end 2=instant 3=finalize
|
|
107
|
+
buf[0] header = (num_groups << 32) | num_blocks
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Identical to the format used by TIRx/flashinfer, so traces are cross-tool compatible.
|
|
111
|
+
|
|
112
|
+
## Caveats
|
|
113
|
+
|
|
114
|
+
- Zero the buffer before launch (the decoder treats 0 as empty).
|
|
115
|
+
- Exactly one leader thread per `(block, group)` lane (two writers clobber the cursor).
|
|
116
|
+
- `%globaltimer_lo` is 32-bit ns: ~tens-of-ns resolution and a ~4.29 s wrap.
|
|
117
|
+
- Persistent grids stream records — cap with `max_records_per_lane` (host) which is
|
|
118
|
+
also enforced device-side via `init(..., max_records_per_lane=...)`.
|
|
119
|
+
- The fence + store perturb tight pipelines; keep events coarse and compare against an
|
|
120
|
+
unprofiled (`ws::Profiler<false>`) build.
|
|
121
|
+
|
|
122
|
+
## Credits & License
|
|
123
|
+
|
|
124
|
+
Licensed under the **MIT License** (see [LICENSE](LICENSE)).
|
|
125
|
+
|
|
126
|
+
- **侯博涵 (Hou Bohan)** — original idea and write-up:
|
|
127
|
+
<https://zhuanlan.zhihu.com/p/2054305616391304228>
|
|
128
|
+
- **Apache TVM TIRx `CudaProfiler`** (Apache-2.0) — wire format + host decode/Perfetto
|
|
129
|
+
export are adapted from it:
|
|
130
|
+
<https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py> ·
|
|
131
|
+
<https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html>
|
|
132
|
+
- The CUDA/Python implementation in this repository was **written by Claude Opus (Anthropic)**.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "warpscope"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "In-kernel %globaltimer profiler for warp-specialized CUDA kernels (Perfetto/Chrome timelines)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "warpscope contributors" }]
|
|
13
|
+
keywords = ["cuda", "profiler", "globaltimer", "perfetto", "gpu", "warp-specialized", "tracing"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Topic :: Software Development :: Libraries",
|
|
18
|
+
"Environment :: GPU :: NVIDIA CUDA",
|
|
19
|
+
]
|
|
20
|
+
dependencies = ["numpy>=1.21"]
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
torch = ["torch"]
|
|
24
|
+
perfetto = ["tg4perfetto"]
|
|
25
|
+
dev = ["pytest"]
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
warpscope = "warpscope.cli:main"
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/YangWang92/warpscope"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools]
|
|
34
|
+
packages = ["warpscope"]
|
|
35
|
+
|
|
36
|
+
[tool.setuptools.package-data]
|
|
37
|
+
warpscope = ["include/*.cuh", "include/*.hpp"]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Build a synthetic buffer the way the device would, then check decode + export."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from warpscope import decode
|
|
9
|
+
from warpscope._wire import (
|
|
10
|
+
EVENT_TYPE_BEGIN,
|
|
11
|
+
EVENT_TYPE_END,
|
|
12
|
+
EVENT_TYPE_FINALIZE,
|
|
13
|
+
pack_header,
|
|
14
|
+
pack_record,
|
|
15
|
+
pack_tag,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _build_buffer(num_blocks, num_groups, lane_records):
    """Assemble a synthetic uint64 profiler buffer for the decoder tests.

    ``lane_records`` maps ``(block, group)`` to a list of
    ``(event_id, type, ts)`` tuples in the order they should be written.
    Slot 0 holds the packed header; record ``k`` of a lane lands at
    ``1 + lane_index + k * (num_blocks * num_groups)``.
    """
    lane_count = num_blocks * num_groups
    deepest = max((len(recs) for recs in lane_records.values()), default=0)
    # One spare round beyond the longest lane, matching the sizing used here.
    rounds = deepest + 1
    buf = np.zeros(1 + lane_count * rounds, dtype=np.uint64)
    buf[0] = pack_header(num_groups, num_blocks)
    for (block, group), recs in lane_records.items():
        first_slot = 1 + block * num_groups + group
        for depth, (ev, typ, ts) in enumerate(recs):
            tag = pack_tag(block, group, num_groups, ev, typ)
            buf[first_slot + depth * lane_count] = pack_record(ts, tag)
    return buf
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_decode_two_groups():
    """Two lanes in one block decode into the expected per-event durations."""
    light_lane = [
        (0, EVENT_TYPE_BEGIN, 100),
        (0, EVENT_TYPE_END, 196),
        (1, EVENT_TYPE_BEGIN, 200),
        (1, EVENT_TYPE_END, 3240),
        (0, EVENT_TYPE_FINALIZE, 3300),
    ]
    heavy_lane = [
        (1, EVENT_TYPE_BEGIN, 200),
        (1, EVENT_TYPE_END, 11016),
        (0, EVENT_TYPE_FINALIZE, 11100),
    ]
    buf = _build_buffer(
        num_blocks=1,
        num_groups=2,
        lane_records={(0, 0): light_lane, (0, 1): heavy_lane},
    )
    res = decode(
        buf,
        event_names={0: "load", 1: "compute"},
        group_names={0: "light", 1: "heavy"},
    )

    assert res.num_blocks == 1
    assert res.num_groups == 2
    durs = {(span.group, span.event_id): span.dur_ns for span in res.spans}
    assert durs[(0, 0)] == 96       # group 0 load
    assert durs[(0, 1)] == 3040     # group 0 compute
    assert durs[(1, 1)] == 10816    # group 1 compute (heavier)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_finalize_drops_later_records():
    """Records written to a lane after its FINALIZE marker are ignored."""
    before = [(0, EVENT_TYPE_BEGIN, 10), (0, EVENT_TYPE_END, 20)]
    marker = [(0, EVENT_TYPE_FINALIZE, 30)]
    after = [(0, EVENT_TYPE_BEGIN, 40), (0, EVENT_TYPE_END, 99)]
    buf = _build_buffer(1, 1, {(0, 0): before + marker + after})
    res = decode(buf)
    # Only the span that closed before the marker survives.
    assert len(res.spans) == 1
    assert res.spans[0].dur_ns == 10
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_timestamp_wrap():
    """A span whose end timestamp is numerically below its begin still decodes."""
    begin_ts, end_ts = 0xFFFFFFF0, 0x10
    records = [(0, EVENT_TYPE_BEGIN, begin_ts), (0, EVENT_TYPE_END, end_ts)]
    res = decode(_build_buffer(1, 1, {(0, 0): records}))
    assert res.spans[0].dur_ns == 0x20  # (0x10 - 0xFFFFFFF0) & 0xFFFFFFFF
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_chrome_trace_export(tmp_path):
    """Exporting one decoded span writes a single "X" event in microseconds."""
    records = [(0, EVENT_TYPE_BEGIN, 1000), (0, EVENT_TYPE_END, 2000)]
    buf = _build_buffer(1, 1, {(0, 0): records})
    res = decode(buf, event_names={0: "work"}, group_names={0: "main"})

    out = os.path.join(tmp_path, "t.json")
    res.to_perfetto(out)
    with open(out) as f:
        data = json.load(f)

    x_events = [e for e in data["traceEvents"] if e["ph"] == "X"]
    assert len(x_events) == 1
    event = x_events[0]
    assert event["name"] == "work"
    # 1000 ns begin and 1000 ns duration, both reported in microseconds
    assert event["ts"] == 1.0
    assert event["dur"] == 1.0
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from warpscope._wire import (
|
|
5
|
+
EVENT_TYPE_BEGIN,
|
|
6
|
+
EVENT_TYPE_END,
|
|
7
|
+
decode_tag,
|
|
8
|
+
pack_header,
|
|
9
|
+
pack_record,
|
|
10
|
+
pack_tag,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_tag_roundtrip():
    """pack_tag followed by decode_tag returns the original four fields."""
    num_groups = 4
    cases = [
        (block, group, ev, typ)
        for block in (0, 1, 7, 147)
        for group in range(num_groups)
        for ev in (0, 1, 9, 1023)
        for typ in range(4)
    ]
    for case in cases:
        block, group, ev, typ = case
        tag = pack_tag(block, group, num_groups, ev, typ)
        b, g, e, t = decode_tag(tag, num_groups)
        assert (b, g, e, t) == case
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_record_layout():
    """A record keeps the timestamp in its upper 32 bits and the tag below."""
    timestamp = 0x1234ABCD
    tag = pack_tag(2, 1, 4, 6, EVENT_TYPE_BEGIN)
    rec = pack_record(timestamp, tag)
    upper = rec >> 32
    lower = rec & 0xFFFFFFFF
    assert upper == timestamp
    assert lower == tag
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_header():
    """The header stores num_groups in the high word, num_blocks in the low."""
    header = pack_header(num_groups=4, num_blocks=148)
    high_word = header >> 32
    low_word = header & 0xFFFFFFFF
    assert high_word == 4
    assert low_word == 148
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_event_id_overflow():
    """pack_tag must reject event id 1024 with ValueError."""
    too_large_event_id = 1024
    with pytest.raises(ValueError):
        pack_tag(0, 0, 1, too_large_event_id, EVENT_TYPE_END)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_view_as_uint64():
    """A record with the top bit set survives an int64 <-> uint64 view."""
    # records must survive an int64<->uint64 view (torch stores as int64)
    rec = pack_record(0xFFFFFFFF, pack_tag(1, 0, 2, 3, EVENT_TYPE_END))
    as_unsigned = np.array([rec], dtype=np.uint64)
    round_tripped = as_unsigned.view(np.int64).view(np.uint64)
    assert int(round_tripped[0]) == rec
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
# Copyright (c) 2026 warpscope contributors
|
|
3
|
+
#
|
|
4
|
+
# Credits:
|
|
5
|
+
# - Original idea & design: 侯博涵 (Hou Bohan)
|
|
6
|
+
# https://zhuanlan.zhihu.com/p/2054305616391304228
|
|
7
|
+
# - Wire format / decode / Perfetto export adapted from Apache TVM TIRx CudaProfiler:
|
|
8
|
+
# https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
|
|
9
|
+
# https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
|
|
10
|
+
# - Implementation written by Claude Opus (Anthropic).
|
|
11
|
+
"""warpscope - in-kernel %globaltimer profiler for warp-specialized CUDA kernels.
|
|
12
|
+
|
|
13
|
+
Bracket logical stages inside a CUDA kernel with the device header
|
|
14
|
+
``warpscope.cuh``, pass a zeroed uint64 buffer, then decode it here into per-
|
|
15
|
+
(block, group) durations or a Perfetto/Chrome trace file.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from ._decode import Instant, Span, TraceResult, decode
|
|
19
|
+
from ._wire import (
|
|
20
|
+
EVENT_TYPE_BEGIN,
|
|
21
|
+
EVENT_TYPE_END,
|
|
22
|
+
EVENT_TYPE_FINALIZE,
|
|
23
|
+
EVENT_TYPE_INSTANT,
|
|
24
|
+
WIRE_VERSION,
|
|
25
|
+
decode_tag,
|
|
26
|
+
pack_tag,
|
|
27
|
+
)
|
|
28
|
+
from .buffer import Profiler
|
|
29
|
+
from .cli import include_dir
|
|
30
|
+
|
|
31
|
+
__version__ = "0.1.0"
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"Profiler",
|
|
35
|
+
"decode",
|
|
36
|
+
"TraceResult",
|
|
37
|
+
"Span",
|
|
38
|
+
"Instant",
|
|
39
|
+
"include_dir",
|
|
40
|
+
"decode_tag",
|
|
41
|
+
"pack_tag",
|
|
42
|
+
"WIRE_VERSION",
|
|
43
|
+
"EVENT_TYPE_BEGIN",
|
|
44
|
+
"EVENT_TYPE_END",
|
|
45
|
+
"EVENT_TYPE_INSTANT",
|
|
46
|
+
"EVENT_TYPE_FINALIZE",
|
|
47
|
+
"__version__",
|
|
48
|
+
]
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
# Copyright (c) 2026 warpscope contributors
|
|
3
|
+
#
|
|
4
|
+
# Credits:
|
|
5
|
+
# - Original idea & design: 侯博涵 (Hou Bohan)
|
|
6
|
+
# https://zhuanlan.zhihu.com/p/2054305616391304228
|
|
7
|
+
# - Decode logic adapted from Apache TVM TIRx CudaProfiler (Apache-2.0):
|
|
8
|
+
# https://github.com/apache/tvm/blob/main/python/tvm/tirx/bench.py
|
|
9
|
+
# https://tvm.apache.org/docs/tirx/native_basics/cuda/profiling.html
|
|
10
|
+
# - Implementation written by Claude Opus (Anthropic).
|
|
11
|
+
"""Decode a warpscope profiler buffer into spans/instants."""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import Dict, List, Optional
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from ._wire import (
|
|
21
|
+
EVENT_TYPE_BEGIN,
|
|
22
|
+
EVENT_TYPE_END,
|
|
23
|
+
EVENT_TYPE_FINALIZE,
|
|
24
|
+
EVENT_TYPE_INSTANT,
|
|
25
|
+
decode_tag,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class Span:
    """One matched begin/end pair recorded by a single (block, group) lane."""

    # Block index recovered from the record tag.
    block: int
    # Group index within the block, recovered from the record tag.
    group: int
    # Event id shared by the begin and end records.
    event_id: int
    # Timestamp carried by the begin record (upper 32 bits of the word).
    begin_ns: int
    # End minus begin, masked to 32 bits by the decoder.
    dur_ns: int
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class Instant:
    """A single point-in-time event recorded by one (block, group) lane."""

    # Block index recovered from the record tag.
    block: int
    # Group index within the block, recovered from the record tag.
    group: int
    # Event id carried by the record tag.
    event_id: int
    # Timestamp carried by the record (upper 32 bits of the word).
    ts_ns: int
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class TraceResult:
    """Decoded contents of one profiler buffer.

    Carries the block and group counts read from the buffer header, the
    matched begin/end spans, the instant events, and the optional
    id -> label maps used when printing, summarising or exporting.
    """

    num_blocks: int
    num_groups: int
    spans: List[Span] = field(default_factory=list)
    instants: List[Instant] = field(default_factory=list)
    event_names: Optional[Dict[int, str]] = None
    group_names: Optional[Dict[int, str]] = None

    def event_name(self, event_id: int) -> str:
        """Return the label for ``event_id``, or ``event_<id>`` if unnamed."""
        names = self.event_names
        if not names or event_id not in names:
            return f"event_{event_id}"
        return names[event_id]

    def group_name(self, group: int) -> str:
        """Return the label for ``group``, or ``group_<id>`` if unnamed."""
        names = self.group_names
        if not names or group not in names:
            return f"group_{group}"
        return names[group]

    def print_durations(self) -> None:
        """Print one line per (block, group) lane listing its span durations.

        Lanes are printed in sorted (block, group) order; within a lane the
        spans keep the order in which they appear in ``self.spans``.
        """
        by_lane: Dict[tuple, List[Span]] = {}
        for span in self.spans:
            key = (span.block, span.group)
            if key not in by_lane:
                by_lane[key] = []
            by_lane[key].append(span)
        for key in sorted(by_lane):
            block, group = key
            parts = ", ".join(
                f"{self.event_name(sp.event_id)}={sp.dur_ns}ns" for sp in by_lane[key]
            )
            print(f"block {block} {self.group_name(group)}: {parts}")

    def summary(self) -> Dict[tuple, Dict[str, float]]:
        """Mean/max duration per (group, event) across all blocks.

        Returns a dict keyed by ``(group_label, event_label)`` whose values
        hold ``count``, ``mean_ns`` and ``max_ns``.
        """
        samples: Dict[tuple, List[int]] = {}
        for span in self.spans:
            samples.setdefault((span.group, span.event_id), []).append(span.dur_ns)

        stats = {}
        for (group, ev), durations in samples.items():
            values = np.asarray(durations, dtype=np.float64)
            label = (self.group_name(group), self.event_name(ev))
            stats[label] = {
                "count": int(values.size),
                "mean_ns": float(values.mean()),
                "max_ns": float(values.max()),
            }
        return stats

    def to_perfetto(self, path: str) -> str:
        """Alias for to_chrome_trace (Perfetto UI also reads Chrome JSON)."""
        return self.to_chrome_trace(path)

    def to_chrome_trace(self, path: str) -> str:
        """Write this result to ``path`` as a Chrome trace and return ``path``."""
        # Imported lazily so the exporter module is only loaded on export.
        from ._perfetto import write_chrome_trace

        write_chrome_trace(self, path)
        return path
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def decode(
    buf,
    event_names: Optional[Dict[int, str]] = None,
    group_names: Optional[Dict[int, str]] = None,
) -> TraceResult:
    """Decode a profiler buffer into a ``TraceResult``.

    Args:
        buf: numpy array, torch tensor (any object exposing ``detach``), or
            a sequence of 64-bit words. Word 0 is the header
            ``(num_groups << 32) | num_blocks``; every later non-zero word
            is one record ``(timestamp << 32) | tag``. Zero words are
            treated as empty slots.
        event_names: optional ``event_id -> label`` map stored on the result.
        group_names: optional ``group -> label`` map stored on the result.

    Returns:
        A ``TraceResult`` whose ``spans`` are the matched begin/end pairs
        and whose ``instants`` are the instant records, both in buffer
        order. An empty buffer, or a header with ``num_groups == 0``,
        yields a result with no events.
    """
    if hasattr(buf, "detach"):  # torch tensor
        buf = buf.detach().cpu().numpy()
    # Reinterpret as unsigned so int64-backed storage keeps its bit pattern.
    arr = np.ascontiguousarray(buf).view(np.uint64).ravel()
    if arr.size == 0:
        return TraceResult(0, 0, event_names=event_names, group_names=group_names)

    header = int(arr[0])
    num_groups = header >> 32
    num_blocks = header & 0xFFFFFFFF
    result = TraceResult(num_blocks, num_groups,
                         event_names=event_names, group_names=group_names)
    if num_groups == 0:
        return result

    # Drop empty slots in one vectorized pass and convert to Python ints
    # once, instead of boxing every slot individually inside the loop.
    slots = arr[1:]
    words = slots[slots != 0].tolist()

    opens: Dict[tuple, int] = {}  # (block, group, event_id) -> begin timestamp
    finished = set()              # lanes that have emitted a FINALIZE record
    for w in words:
        ts = w >> 32
        tag = w & 0xFFFFFFFF
        block, group, ev, typ = decode_tag(tag, num_groups)
        lane = (block, group)
        if typ == EVENT_TYPE_FINALIZE:
            finished.add(lane)
            continue
        if lane in finished:
            # Anything a lane wrote after its FINALIZE marker is ignored.
            continue
        if typ == EVENT_TYPE_BEGIN:
            # A repeated begin for the same key replaces the pending one.
            opens[(block, group, ev)] = ts
        elif typ == EVENT_TYPE_END:
            t0 = opens.pop((block, group, ev), None)
            if t0 is not None:
                # Mask to 32 bits so an end timestamp that is numerically
                # below its begin still yields a non-negative duration.
                result.spans.append(Span(block, group, ev, t0, (ts - t0) & 0xFFFFFFFF))
        elif typ == EVENT_TYPE_INSTANT:
            result.instants.append(Instant(block, group, ev, ts))
    return result
|