gpufl 0.1.0.dev0__tar.gz → 0.1.0.dev7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpufl-0.1.0.dev7/.github/workflows/release.yml +193 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/CMakeLists.txt +77 -3
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/PKG-INFO +1 -1
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/block_style_example.cu +3 -1
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/python/analyzer/01_analyzer_sample.py +5 -1
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/cupti_backend.cpp +316 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/cupti_backend.hpp +116 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/cupti_common.hpp +11 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/cupti_utils.cpp +152 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/cupti_utils.hpp +28 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/engine/pc_sampling_engine.cpp +395 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/engine/pc_sampling_engine.hpp +66 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/engine/profiling_engine.hpp +73 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/engine/range_profiler_engine.cpp +479 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/engine/range_profiler_engine.hpp +53 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +221 -0
- gpufl-0.1.0.dev7/include/gpufl/backends/nvidia/engine/sass_metrics_engine.hpp +44 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/kernel_launch_handler.cpp +46 -1
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/resource_handler.cpp +2 -3
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/debug_logger.hpp +1 -1
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/events.hpp +29 -8
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/gpufl.cpp +54 -21
- gpufl-0.1.0.dev7/include/gpufl/core/logger/file_compressor.cpp +44 -0
- gpufl-0.1.0.dev7/include/gpufl/core/logger/file_compressor.hpp +18 -0
- gpufl-0.1.0.dev7/include/gpufl/core/logger/log_rotator.cpp +65 -0
- gpufl-0.1.0.dev7/include/gpufl/core/logger/log_rotator.hpp +32 -0
- gpufl-0.1.0.dev7/include/gpufl/core/logger/logger.cpp +152 -0
- gpufl-0.1.0.dev7/include/gpufl/core/logger/logger.hpp +70 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/kernel_event_model.cpp +51 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/kernel_event_model.hpp +16 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/lifecycle_model.cpp +34 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/lifecycle_model.hpp +24 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/memcpy_event_model.cpp +58 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/memcpy_event_model.hpp +24 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/model_utils.hpp +94 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/perf_metric_model.cpp +33 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/perf_metric_model.hpp +16 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/profile_sample_model.cpp +40 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/profile_sample_model.hpp +16 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/scope_event_model.cpp +43 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/scope_event_model.hpp +24 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/serializable.hpp +15 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/system_event_model.cpp +51 -0
- gpufl-0.1.0.dev7/include/gpufl/core/model/system_event_model.hpp +32 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/monitor.hpp +25 -30
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/monitor_backend.hpp +7 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/sampler.cpp +3 -2
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/cuda/monitor.cpp +33 -8
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/gpufl.hpp +10 -7
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/pyproject.toml +1 -1
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/bindings.cpp +39 -13
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/gpufl/__init__.py +10 -4
- gpufl-0.1.0.dev7/python/gpufl/analyzer/analyzer.py +721 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/backends/nvidia/test_nvidia_backend.cpp +1 -1
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/core/test_analyzer.cpp +1 -1
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/verify_pipeline.py +10 -2
- gpufl-0.1.0.dev0/.github/workflows/release.yml +0 -71
- gpufl-0.1.0.dev0/include/gpufl/backends/nvidia/cupti_backend.cpp +0 -806
- gpufl-0.1.0.dev0/include/gpufl/backends/nvidia/cupti_backend.hpp +0 -164
- gpufl-0.1.0.dev0/include/gpufl/backends/nvidia/cupti_utils.cpp +0 -73
- gpufl-0.1.0.dev0/include/gpufl/core/logger.cpp +0 -437
- gpufl-0.1.0.dev0/include/gpufl/core/logger.hpp +0 -88
- gpufl-0.1.0.dev0/python/gpufl/analyzer/analyzer.py +0 -359
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/.clang-format +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/.github/pull_request_template.md +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/.github/workflows/build.yml +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/.gitignore +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/CONTRIBUTING.md +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/LICENSE +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/README.md +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/build.sh +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/CMakeLists.txt +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/check_conflict.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/check_device.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/cupti_basic.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/cupti_pc_sampling.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/list_sass_metrics.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/occupancy_demo.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/system_monitor.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/test_occupancy.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/test_sass_cubin.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/test_sass_metrics.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/cuda/vector_add_benchmark.cu +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/python/01_basic.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/python/02_numba_cuda.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/python/03_pytorch_benchmark.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/python/requirements.txt +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/python/viz/01_plot_memory_timeline.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/example/python/viz/02_plot_stress_timeline.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/images/Screenshot1.png +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/amd/rocm_collector.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/amd/rocm_collector.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/host_collector.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/cuda_collector.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/cuda_collector.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/kernel_launch_handler.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/mem_transfer_handler.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/mem_transfer_handler.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/nvml_collector.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/nvml_collector.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/resource_handler.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/sampler/cupti_sass.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/backends/nvidia/sampler/cupti_sass.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/common.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/common.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/debug_logger.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/ring_buffer.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/runtime.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/runtime.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/sampler.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/scope_registry.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/scope_registry.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/stack_registry.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/stack_trace.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/stack_trace.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl/core/trace_type.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/include/gpufl.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/gpufl/.gitignore +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/gpufl/analyzer/__init__.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/gpufl/utils.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/gpufl/viz/__init__.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/gpufl/viz/reader.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/gpufl/viz/timeline.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/python/gpufl/viz/visualizer.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/schema/ndjson.schema.json +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/CMakeLists.txt +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/backends/nvidia/test_cuda_collector.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/backends/nvidia/test_nvml_collector.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/common/test_utils.hpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/core/test_monitor.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/main_test_runner.cpp +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/python/conftest.py +0 -0
- {gpufl-0.1.0.dev0 → gpufl-0.1.0.dev7}/tests/python/test_analyzer.py +0 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# Release pipeline: builds binary wheels (Linux + Windows) and an sdist, then
# publishes everything to PyPI when the run was triggered by a pushed `v*` tag.
# A manual `workflow_dispatch` run builds the artifacts but skips publishing.
name: Build and Release Wheels

on:
  push:
    tags:
      - 'v*'
  workflow_dispatch:

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-22.04, windows-latest]

    steps:
      - uses: actions/checkout@v4

      # Stamp the tag's version (tag name minus the leading "v") into the
      # package metadata before building. Skipped for non-tag runs.
      - name: Set package version from tag
        if: startsWith(github.ref, 'refs/tags/v')
        shell: python
        run: |
          import os
          import re
          from pathlib import Path

          ref_name = os.environ.get("GITHUB_REF_NAME", "")
          if not ref_name.startswith("v"):
              raise SystemExit(f"Expected tag starting with 'v', got: {ref_name}")
          version = ref_name[1:]
          print(f"Using version from tag: {version}")

          pyproject = Path("pyproject.toml")
          text = pyproject.read_text(encoding="utf-8")
          text_new, n = re.subn(
              r'(?m)^version\s*=\s*"[^\"]+"$',
              f'version = "{version}"',
              text,
              count=1,
          )
          if n != 1:
              raise SystemExit("Failed to update [project].version in pyproject.toml")
          pyproject.write_text(text_new, encoding="utf-8")

          # __init__.py is optional, so a miss is not fatal -- but it must not
          # be silent either: an unmatched pattern would ship a wheel whose
          # runtime __version__ disagrees with the package metadata.
          init_py = Path("python/gpufl/__init__.py")
          if init_py.exists():
              init_text = init_py.read_text(encoding="utf-8")
              init_new, init_n = re.subn(
                  r'(?m)^__version__\s*=\s*"[^\"]+"$',
                  f'__version__ = "{version}"',
                  init_text,
                  count=1,
              )
              if init_n == 1:
                  init_py.write_text(init_new, encoding="utf-8")
              else:
                  print(
                      "::warning file=python/gpufl/__init__.py::"
                      f"No __version__ assignment matched; runtime version may differ from {version}"
                  )

      - name: Cache cibuildwheel downloads
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/cibuildwheel
            ~/AppData/Local/pypa/cibuildwheel/Cache
          key: cibw-${{ runner.os }}-${{ hashFiles('.github/workflows/release.yml') }}
          restore-keys: |
            cibw-${{ runner.os }}-

      - name: Install CUDA (Windows)
        if: runner.os == 'Windows'
        uses: Jimver/cuda-toolkit@v0.2.30
        with:
          cuda: '13.1.0'
          method: 'network'

      # Pre-populate the cibuildwheel cache with virtualenv.pyz, trying several
      # mirrors with linear back-off (5s, 10s, ...) between failed attempts.
      # The version here must stay in sync with CIBW_VIRTUALENV_VERSION below.
      # NOTE(review): the last URL is not version-pinned, yet its download is
      # stored under the pinned file name -- confirm this fallback is intended.
      - name: Prefetch virtualenv.pyz (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          $version = "20.27.1"
          $cacheDir = Join-Path $env:LOCALAPPDATA "pypa\cibuildwheel\Cache"
          New-Item -ItemType Directory -Path $cacheDir -Force | Out-Null
          $dest = Join-Path $cacheDir "virtualenv-$version.pyz"
          if (Test-Path $dest) {
            Write-Host "virtualenv.pyz already cached: $dest"
            exit 0
          }
          $urls = @(
            "https://raw.githubusercontent.com/pypa/get-virtualenv/$version/public/virtualenv.pyz",
            "https://raw.githubusercontent.com/pypa/get-virtualenv/refs/tags/$version/public/virtualenv.pyz",
            "https://bootstrap.pypa.io/virtualenv.pyz"
          )
          $max = 6
          $ok = $false
          foreach ($url in $urls) {
            for ($i = 1; $i -le $max; $i++) {
              try {
                Write-Host "Downloading virtualenv.pyz from $url (attempt $i/$max)..."
                Invoke-WebRequest -Uri $url -OutFile $dest -TimeoutSec 120 -Headers @{ "User-Agent" = "cibuildwheel-prefetch" }
                if ((Get-Item $dest).Length -gt 0) {
                  Write-Host "Downloaded: $dest"
                  $ok = $true
                  break
                }
              } catch {
                if (Test-Path $dest) { Remove-Item $dest -Force -ErrorAction SilentlyContinue }
                if ($i -eq $max) { break }
                Start-Sleep -Seconds (5 * $i)
              }
            }
            if ($ok) { break }
          }
          if (-not $ok) { throw "Failed to prefetch virtualenv.pyz from all sources." }

      # Linux wheels are built inside the manylinux_2_28 image; the CUDA
      # packages are installed into that image by CIBW_BEFORE_ALL_LINUX.
      # libcuda.so.1 is excluded from the repaired wheel (--exclude below).
      # NOTE(review): `dnf install --nogpgcheck` skips package signature
      # verification for the CUDA repo -- confirm this is acceptable.
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.22.0
        env:
          CIBW_VIRTUALENV_VERSION: "20.27.1"
          CIBW_ENVIRONMENT_LINUX: "CUDA_HOME=/usr/local/cuda PATH=/usr/local/cuda/bin:$PATH CMAKE_ARGS='-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF'"
          CIBW_BEFORE_ALL_LINUX: >-
            curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo > /etc/yum.repos.d/cuda.repo &&
            dnf install -y --nogpgcheck cuda-nvcc-13-1 cuda-cudart-devel-13-1 cuda-cupti-13-1 cuda-driver-devel-13-1
          CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
          CIBW_BUILD: "cp312-manylinux_x86_64 cp313-manylinux_x86_64 cp312-win_amd64 cp313-win_amd64"
          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair --plat manylinux_2_28_x86_64 --exclude libcuda.so.1 -w {dest_dir} {wheel}"

      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Same version-stamping script as in build_wheels; keep the two copies
      # in sync so wheels and the sdist carry the same version.
      - name: Set package version from tag
        if: startsWith(github.ref, 'refs/tags/v')
        shell: python
        run: |
          import os
          import re
          from pathlib import Path

          ref_name = os.environ.get("GITHUB_REF_NAME", "")
          if not ref_name.startswith("v"):
              raise SystemExit(f"Expected tag starting with 'v', got: {ref_name}")
          version = ref_name[1:]
          print(f"Using version from tag: {version}")

          pyproject = Path("pyproject.toml")
          text = pyproject.read_text(encoding="utf-8")
          text_new, n = re.subn(
              r'(?m)^version\s*=\s*"[^\"]+"$',
              f'version = "{version}"',
              text,
              count=1,
          )
          if n != 1:
              raise SystemExit("Failed to update [project].version in pyproject.toml")
          pyproject.write_text(text_new, encoding="utf-8")

          # Optional file: warn (do not fail) when no __version__ line matched.
          init_py = Path("python/gpufl/__init__.py")
          if init_py.exists():
              init_text = init_py.read_text(encoding="utf-8")
              init_new, init_n = re.subn(
                  r'(?m)^__version__\s*=\s*"[^\"]+"$',
                  f'__version__ = "{version}"',
                  init_text,
                  count=1,
              )
              if init_n == 1:
                  init_py.write_text(init_new, encoding="utf-8")
              else:
                  print(
                      "::warning file=python/gpufl/__init__.py::"
                      f"No __version__ assignment matched; runtime version may differ from {version}"
                  )

      - name: Build sdist
        run: pipx run build --sdist

      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz

  # Publishes only for tag pushes; collects every `cibw-*` artifact produced
  # by the two build jobs into a single dist/ directory first.
  upload_pypi:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: actions/download-artifact@v4
        with:
          pattern: cibw-*
          path: dist
          merge-multiple: true

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -47,7 +47,15 @@ target_compile_features(gpufl INTERFACE cxx_std_17)
|
|
|
47
47
|
set_target_properties(gpufl PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
48
48
|
|
|
49
49
|
target_sources(gpufl PRIVATE
|
|
50
|
-
include/gpufl/core/logger.cpp
|
|
50
|
+
include/gpufl/core/logger/logger.cpp
|
|
51
|
+
include/gpufl/core/logger/log_rotator.cpp
|
|
52
|
+
include/gpufl/core/model/lifecycle_model.cpp
|
|
53
|
+
include/gpufl/core/model/kernel_event_model.cpp
|
|
54
|
+
include/gpufl/core/model/memcpy_event_model.cpp
|
|
55
|
+
include/gpufl/core/model/scope_event_model.cpp
|
|
56
|
+
include/gpufl/core/model/profile_sample_model.cpp
|
|
57
|
+
include/gpufl/core/model/perf_metric_model.cpp
|
|
58
|
+
include/gpufl/core/model/system_event_model.cpp
|
|
51
59
|
include/gpufl/core/sampler.cpp
|
|
52
60
|
include/gpufl/core/runtime.cpp
|
|
53
61
|
include/gpufl/core/gpufl.cpp
|
|
@@ -61,6 +69,43 @@ set(GPUFL_HAS_CUDA 0)
|
|
|
61
69
|
set(GPUFL_HAS_NVML 0)
|
|
62
70
|
set(GPUFL_HAS_ROCM 0)
|
|
63
71
|
set(GPUFL_HAS_CUPTI 0)
|
|
72
|
+
set(GPUFL_HAS_PERFWORKS 0)
|
|
73
|
+
# ZLIB is mandatory for the log compressor. Prefer whatever the system
# provides; otherwise build a pinned copy via FetchContent so that every
# platform (Windows CI included) ends up with .gz output support.
find_package(ZLIB QUIET)
if(NOT ZLIB_FOUND)
    message(STATUS "ZLIB not found on system — fetching via FetchContent")
    include(FetchContent)
    # Keep zlib's example / test targets out of our build.
    set(ZLIB_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
    FetchContent_Declare(
        zlib
        GIT_REPOSITORY https://github.com/madler/zlib.git
        GIT_TAG        v1.3.1
    )
    FetchContent_MakeAvailable(zlib)
    # Headers come from two places: zlib.h from the source tree and the
    # generated zconf.h from the binary tree. Both are attached PRIVATE
    # because consumers of gpufl never include zlib headers themselves.
    target_link_libraries(gpufl PRIVATE zlibstatic)
    target_include_directories(gpufl PRIVATE
        ${zlib_SOURCE_DIR}
        ${zlib_BINARY_DIR}
    )
    # zlibstatic advertises build-tree include paths on its interface, which
    # install(EXPORT) rejects. They are redundant here (gpufl carries the
    # paths privately), so blank them out.
    set_target_properties(zlibstatic PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "")
    # A static gpufl links zlibstatic transitively, so CMake insists that it
    # lives in the same export set.
    install(TARGETS zlibstatic
        EXPORT gpufl_clientTargets
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    )
else()
    message(STATUS "Found system ZLIB: ${ZLIB_LIBRARIES}")
    target_link_libraries(gpufl PRIVATE ZLIB::ZLIB)
endif()
target_sources(gpufl PRIVATE include/gpufl/core/logger/file_compressor.cpp)
|
|
64
109
|
|
|
65
110
|
# -----------------------
|
|
66
111
|
# Backends
|
|
@@ -95,7 +140,10 @@ if(GPUFL_ENABLE_NVIDIA)
|
|
|
95
140
|
include/gpufl/backends/nvidia/kernel_launch_handler.cpp
|
|
96
141
|
include/gpufl/backends/nvidia/mem_transfer_handler.cpp
|
|
97
142
|
include/gpufl/cuda/monitor.cpp
|
|
98
|
-
include/gpufl/backends/nvidia/cupti_backend.cpp
|
|
143
|
+
include/gpufl/backends/nvidia/cupti_backend.cpp
|
|
144
|
+
include/gpufl/backends/nvidia/engine/pc_sampling_engine.cpp
|
|
145
|
+
include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp
|
|
146
|
+
include/gpufl/backends/nvidia/engine/range_profiler_engine.cpp)
|
|
99
147
|
target_link_libraries(gpufl PRIVATE CUDA::cudart CUDA::cuda_driver)
|
|
100
148
|
|
|
101
149
|
# --------------------------------------------------------
|
|
@@ -124,6 +172,31 @@ if(GPUFL_ENABLE_NVIDIA)
|
|
|
124
172
|
message(STATUS "Found CUPTI manually: ${CUPTI_LIBRARY}")
|
|
125
173
|
endif()
|
|
126
174
|
endif()
|
|
175
|
+
|
|
176
|
+
# --------------------------------------------------------
# NVPERF Support (for GFL_PERF_SCOPE hardware counters)
# --------------------------------------------------------
# Both NVPERF libraries are searched in the same toolkit locations, so the
# hint list is spelled out once and shared by the two lookups.
set(_gpufl_nvperf_hints
    "${CUDAToolkit_ROOT}/extras/CUPTI/lib64"
    "${CUDAToolkit_ROOT}/targets/x86_64-linux/lib"
    "$ENV{CUDA_PATH}/extras/CUPTI/lib64"
    "$ENV{CUDA_PATH}/targets/x86_64-linux/lib"
)
find_library(NVPERF_HOST_LIBRARY
    NAMES nvperf_host nvperf_host_static
    HINTS ${_gpufl_nvperf_hints}
)
find_library(NVPERF_TARGET_LIBRARY
    NAMES nvperf_target
    HINTS ${_gpufl_nvperf_hints}
)
# The feature is enabled only when BOTH the host and target libraries exist.
if(NOT NVPERF_HOST_LIBRARY OR NOT NVPERF_TARGET_LIBRARY)
    set(GPUFL_HAS_PERFWORKS 0)
    message(WARNING "NVPERF not found — GFL_PERF_SCOPE disabled at runtime")
else()
    set(GPUFL_HAS_PERFWORKS 1)
    target_link_libraries(gpufl PRIVATE
        ${NVPERF_HOST_LIBRARY} ${NVPERF_TARGET_LIBRARY})
    message(STATUS "Found NVPERF: ${NVPERF_HOST_LIBRARY}")
endif()
|
|
127
200
|
endif()
|
|
128
201
|
endif()
|
|
129
202
|
#
|
|
@@ -178,10 +251,11 @@ if(GPUFL_ENABLE_NVIDIA)
|
|
|
178
251
|
endif()
|
|
179
252
|
|
|
180
253
|
# Apply definitions to public interface so tests inherit them
|
|
181
|
-
target_compile_definitions(gpufl PUBLIC
|
|
254
|
+
target_compile_definitions(gpufl PUBLIC
|
|
182
255
|
GPUFL_HAS_CUDA=${GPUFL_HAS_CUDA}
|
|
183
256
|
GPUFL_HAS_NVML=${GPUFL_HAS_NVML}
|
|
184
257
|
GPUFL_HAS_CUPTI=${GPUFL_HAS_CUPTI}
|
|
258
|
+
GPUFL_HAS_PERFWORKS=${GPUFL_HAS_PERFWORKS}
|
|
185
259
|
)
|
|
186
260
|
|
|
187
261
|
#
|
|
@@ -36,12 +36,14 @@ int main() {
|
|
|
36
36
|
// Initialize GFL
|
|
37
37
|
gpufl::InitOptions opts;
|
|
38
38
|
opts.app_name = "block_style_demo";
|
|
39
|
-
opts.log_path = "gfl_block
|
|
39
|
+
opts.log_path = "gfl_block";
|
|
40
40
|
opts.system_sample_rate_ms = 50;
|
|
41
41
|
opts.kernel_sample_rate_ms = 50;
|
|
42
42
|
opts.enable_kernel_details = true;
|
|
43
43
|
opts.sampling_auto_start = true;
|
|
44
44
|
opts.enable_debug_output = true;
|
|
45
|
+
opts.profiling_engine = gpufl::ProfilingEngine::SassMetrics;
|
|
46
|
+
|
|
45
47
|
if (!gpufl::init(opts)) {
|
|
46
48
|
std::cerr << "Failed to initialize gpufl" << std::endl;
|
|
47
49
|
return 1;
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
#include "gpufl/backends/nvidia/cupti_backend.hpp"
|
|
2
|
+
|
|
3
|
+
#include <cupti_pcsampling.h>
|
|
4
|
+
#include <cupti_profiler_target.h>
|
|
5
|
+
#include <cupti_sass_metrics.h>
|
|
6
|
+
#include <cupti_target.h>
|
|
7
|
+
|
|
8
|
+
#if GPUFL_HAS_PERFWORKS
|
|
9
|
+
#include <cupti_range_profiler.h>
|
|
10
|
+
#endif
|
|
11
|
+
|
|
12
|
+
#include <cstring>
|
|
13
|
+
#include <exception>
|
|
14
|
+
#include <set>
|
|
15
|
+
|
|
16
|
+
#include "gpufl/backends/nvidia/cupti_utils.hpp"
|
|
17
|
+
#include "gpufl/backends/nvidia/engine/pc_sampling_engine.hpp"
|
|
18
|
+
#include "gpufl/backends/nvidia/engine/range_profiler_engine.hpp"
|
|
19
|
+
#include "gpufl/backends/nvidia/engine/sass_metrics_engine.hpp"
|
|
20
|
+
#include "gpufl/backends/nvidia/kernel_launch_handler.hpp"
|
|
21
|
+
#include "gpufl/backends/nvidia/mem_transfer_handler.hpp"
|
|
22
|
+
#include "gpufl/backends/nvidia/resource_handler.hpp"
|
|
23
|
+
#include "gpufl/core/common.hpp"
|
|
24
|
+
#include "gpufl/core/debug_logger.hpp"
|
|
25
|
+
#include "gpufl/core/ring_buffer.hpp"
|
|
26
|
+
#include "gpufl/core/trace_type.hpp"
|
|
27
|
+
|
|
28
|
+
#include "gpufl/backends/nvidia/cuda_collector.hpp"
|
|
29
|
+
#include "gpufl/core/scope_registry.hpp"
|
|
30
|
+
#include "gpufl/core/stack_registry.hpp"
|
|
31
|
+
#include "gpufl/core/stack_trace.hpp"
|
|
32
|
+
|
|
33
|
+
namespace gpufl {
|
|
34
|
+
std::atomic<gpufl::CuptiBackend*> g_activeBackend{nullptr};
|
|
35
|
+
|
|
36
|
+
extern RingBuffer<ActivityRecord, 1024> g_monitorBuffer;
|
|
37
|
+
|
|
38
|
+
void CuptiBackend::initialize(const MonitorOptions& opts) {
|
|
39
|
+
opts_ = opts;
|
|
40
|
+
|
|
41
|
+
DebugLogger::setEnabled(opts_.enable_debug_output);
|
|
42
|
+
|
|
43
|
+
// Create the engine (no CUDA context needed yet)
|
|
44
|
+
switch (opts_.profiling_engine) {
|
|
45
|
+
case ProfilingEngine::PcSampling:
|
|
46
|
+
engine_ = std::make_unique<PcSamplingEngine>();
|
|
47
|
+
GFL_LOG_DEBUG("[CuptiBackend] Engine: PcSampling");
|
|
48
|
+
break;
|
|
49
|
+
case ProfilingEngine::SassMetrics:
|
|
50
|
+
engine_ = std::make_unique<SassMetricsEngine>();
|
|
51
|
+
GFL_LOG_DEBUG("[CuptiBackend] Engine: SassMetrics");
|
|
52
|
+
break;
|
|
53
|
+
case ProfilingEngine::RangeProfiler:
|
|
54
|
+
#if GPUFL_HAS_PERFWORKS
|
|
55
|
+
engine_ = std::make_unique<RangeProfilerEngine>();
|
|
56
|
+
GFL_LOG_DEBUG("[CuptiBackend] Engine: RangeProfiler");
|
|
57
|
+
#else
|
|
58
|
+
GFL_LOG_ERROR("[CuptiBackend] RangeProfiler engine requires "
|
|
59
|
+
"GPUFL_HAS_PERFWORKS; falling back to None");
|
|
60
|
+
#endif
|
|
61
|
+
break;
|
|
62
|
+
case ProfilingEngine::None:
|
|
63
|
+
default:
|
|
64
|
+
GFL_LOG_DEBUG("[CuptiBackend] Engine: None (monitoring only)");
|
|
65
|
+
break;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
g_activeBackend.store(this, std::memory_order_release);
|
|
69
|
+
|
|
70
|
+
// Internal handler registration
|
|
71
|
+
RegisterHandler(std::make_shared<ResourceHandler>(this));
|
|
72
|
+
RegisterHandler(std::make_shared<KernelLaunchHandler>(this));
|
|
73
|
+
RegisterHandler(std::make_shared<MemTransferHandler>(this));
|
|
74
|
+
|
|
75
|
+
GFL_LOG_DEBUG("Subscribing to CUPTI...");
|
|
76
|
+
CUPTI_CHECK_RETURN(
|
|
77
|
+
cuptiSubscribe(&subscriber_,
|
|
78
|
+
reinterpret_cast<CUpti_CallbackFunc>(GflCallback), this),
|
|
79
|
+
"[GPUFL Monitor] ERROR: Failed to subscribe to CUPTI\n"
|
|
80
|
+
"[GPUFL Monitor] This may indicate:\n"
|
|
81
|
+
" - CUPTI library not found or incompatible\n"
|
|
82
|
+
" - Insufficient permissions\n"
|
|
83
|
+
" - CUDA driver issues");
|
|
84
|
+
GFL_LOG_DEBUG("CUPTI subscription successful");
|
|
85
|
+
|
|
86
|
+
std::set<CUpti_CallbackDomain> domains;
|
|
87
|
+
std::set<std::pair<CUpti_CallbackDomain, CUpti_CallbackId>> callbacks;
|
|
88
|
+
{
|
|
89
|
+
std::lock_guard<std::mutex> lk(handler_mu_);
|
|
90
|
+
for (const auto& h : handlers_) {
|
|
91
|
+
for (auto d : h->requiredDomains()) domains.insert(d);
|
|
92
|
+
for (auto cb : h->requiredCallbacks()) callbacks.insert(cb);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
for (auto d : domains) CUPTI_CHECK(cuptiEnableDomain(1, subscriber_, d));
|
|
96
|
+
for (auto& [domain, cbid] : callbacks)
|
|
97
|
+
CUPTI_CHECK(cuptiEnableCallback(1, subscriber_, domain, cbid));
|
|
98
|
+
|
|
99
|
+
CUptiResult resCb =
|
|
100
|
+
cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted);
|
|
101
|
+
if (resCb != CUPTI_SUCCESS) {
|
|
102
|
+
GFL_LOG_ERROR("FATAL: Failed to register activity callbacks.");
|
|
103
|
+
LogCuptiErrorIfFailed("CUPTI", "cuptiActivityRegisterCallbacks", resCb);
|
|
104
|
+
initialized_ = false;
|
|
105
|
+
return;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
initialized_ = true;
|
|
109
|
+
GFL_LOG_DEBUG("Callbacks registered successfully.");
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
void CuptiBackend::shutdown() {
|
|
113
|
+
if (!initialized_) return;
|
|
114
|
+
|
|
115
|
+
// Delegate engine teardown first
|
|
116
|
+
if (engine_) {
|
|
117
|
+
engine_->stop();
|
|
118
|
+
engine_->shutdown();
|
|
119
|
+
engine_.reset();
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
LogCuptiErrorIfFailed("Perfworks", "cuptiActivityFlushAll",
|
|
123
|
+
cuptiActivityFlushAll(1));
|
|
124
|
+
|
|
125
|
+
{
|
|
126
|
+
std::lock_guard<std::mutex> lk(handler_mu_);
|
|
127
|
+
std::set<CUpti_CallbackDomain> domains;
|
|
128
|
+
for (const auto& h : handlers_)
|
|
129
|
+
for (auto d : h->requiredDomains()) domains.insert(d);
|
|
130
|
+
for (auto d : domains) cuptiEnableDomain(0, subscriber_, d);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
cuptiUnsubscribe(subscriber_);
|
|
134
|
+
g_activeBackend.store(nullptr, std::memory_order_release);
|
|
135
|
+
initialized_ = false;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Hands back a pointer to CUPTI's cuptiActivityEnable function.
CUptiResult (*CuptiBackend::get_value())(CUpti_ActivityKind) {
    CUptiResult (*const enable_fn)(CUpti_ActivityKind) = cuptiActivityEnable;
    return enable_fn;
}
|
|
141
|
+
|
|
142
|
+
void CuptiBackend::start() {
|
|
143
|
+
if (!initialized_) return;
|
|
144
|
+
kernel_activity_seen_.store(0, std::memory_order_relaxed);
|
|
145
|
+
kernel_activity_emitted_.store(0, std::memory_order_relaxed);
|
|
146
|
+
kernel_activity_throttled_.store(0, std::memory_order_relaxed);
|
|
147
|
+
|
|
148
|
+
CUPTI_CHECK(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR));
|
|
149
|
+
|
|
150
|
+
// Enable activity kinds required by registered handlers (always on)
|
|
151
|
+
{
|
|
152
|
+
std::set<CUpti_ActivityKind> kinds;
|
|
153
|
+
{
|
|
154
|
+
std::lock_guard<std::mutex> lk(handler_mu_);
|
|
155
|
+
for (const auto& h : handlers_)
|
|
156
|
+
for (auto k : h->requiredActivityKinds()) kinds.insert(k);
|
|
157
|
+
}
|
|
158
|
+
for (auto k : kinds) CUPTI_CHECK(cuptiActivityEnable(k));
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// Initialize and start the engine (requires CUDA context)
|
|
162
|
+
if (engine_) {
|
|
163
|
+
if (EnsureCudaContext(&ctx_)) {
|
|
164
|
+
cuptiGetDeviceId(ctx_, &device_id_);
|
|
165
|
+
chip_name_ = getChipName(device_id_);
|
|
166
|
+
cached_device_name_ = GetCurrentDeviceName();
|
|
167
|
+
|
|
168
|
+
EngineContext ectx{ctx_, device_id_, chip_name_,
|
|
169
|
+
&cubin_mu_, &cubin_by_crc_};
|
|
170
|
+
engine_->initialize(opts_, ectx);
|
|
171
|
+
engine_->start();
|
|
172
|
+
} else {
|
|
173
|
+
GFL_LOG_ERROR("[CuptiBackend] Failed to get CUDA context; "
|
|
174
|
+
"engine will not start.");
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
active_.store(true);
|
|
179
|
+
GFL_LOG_DEBUG("Backend started.");
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
void CuptiBackend::stop() {
|
|
183
|
+
if (!initialized_) return;
|
|
184
|
+
active_.store(false);
|
|
185
|
+
|
|
186
|
+
LogCuptiErrorIfFailed("Perfworks", "cuptiActivityFlushAll",
|
|
187
|
+
cuptiActivityFlushAll(1));
|
|
188
|
+
|
|
189
|
+
{
|
|
190
|
+
std::set<CUpti_ActivityKind> kinds;
|
|
191
|
+
{
|
|
192
|
+
std::lock_guard<std::mutex> lk(handler_mu_);
|
|
193
|
+
for (const auto& h : handlers_)
|
|
194
|
+
for (auto k : h->requiredActivityKinds()) kinds.insert(k);
|
|
195
|
+
}
|
|
196
|
+
for (auto k : kinds) cuptiActivityDisable(k);
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
const uint64_t seen = kernel_activity_seen_.load(std::memory_order_relaxed);
|
|
200
|
+
const uint64_t emitted =
|
|
201
|
+
kernel_activity_emitted_.load(std::memory_order_relaxed);
|
|
202
|
+
const uint64_t throttled =
|
|
203
|
+
kernel_activity_throttled_.load(std::memory_order_relaxed);
|
|
204
|
+
GFL_LOG_DEBUG("[KernelLaunchHandler] activity summary seen=", seen,
|
|
205
|
+
" emitted=", emitted, " throttled=", throttled);
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
void CuptiBackend::RegisterHandler(
|
|
209
|
+
const std::shared_ptr<ICuptiHandler>& handler) {
|
|
210
|
+
if (!handler) return;
|
|
211
|
+
std::lock_guard<std::mutex> lk(handler_mu_);
|
|
212
|
+
handlers_.push_back(handler);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
// ---- Static callbacks ------------------------------------------------------
|
|
216
|
+
|
|
217
|
+
/// CUPTI "buffer requested" callback: hands CUPTI a fresh 64 KiB buffer.
///
/// The buffer comes from malloc() and is released with free() in
/// BufferCompleted.
///
/// @param buffer        Receives the allocated buffer (null if malloc fails).
/// @param size          Receives the buffer size in bytes.
/// @param maxNumRecords Set to 0.
void CUPTIAPI CuptiBackend::BufferRequested(uint8_t** buffer, size_t* size,
                                            size_t* maxNumRecords) {
    constexpr size_t kActivityBufferBytes = 64 * 1024;

    *size = kActivityBufferBytes;
    *buffer = static_cast<uint8_t*>(malloc(kActivityBufferBytes));
    *maxNumRecords = 0;
}
|
|
223
|
+
|
|
224
|
+
/// CUPTI "buffer completed" callback: drains one activity buffer, then frees
/// it.
///
/// Every record is offered to the registered handlers in registration order;
/// the first handler whose handleActivityRecord() returns true claims it.
/// A PC-sampling record that no handler claims is converted to a PC_SAMPLE
/// ActivityRecord and pushed to g_monitorBuffer.
///
/// @param context   Not used by this implementation.
/// @param streamId  Not used by this implementation.
/// @param buffer    Buffer obtained from BufferRequested; always freed here.
/// @param size      Not used by this implementation.
/// @param validSize Number of bytes in `buffer` that hold records.
void CUPTIAPI CuptiBackend::BufferCompleted(CUcontext context,
                                            uint32_t streamId, uint8_t* buffer,
                                            size_t size,
                                            const size_t validSize) {
    auto* backend = g_activeBackend.load(std::memory_order_acquire);
    if (!backend) {
        ::gpufl::DebugLogger::error("[CUPTI] ",
                                    "BufferCompleted: No active backend!");
        if (buffer) free(buffer);
        return;
    }

    // CPU/CUPTI timestamp pair captured on the first callback; it is passed
    // to every handler together with the record.
    static int64_t baseCpuNs = detail::GetTimestampNs();
    static uint64_t baseCuptiTs = 0;
    if (baseCuptiTs == 0) cuptiGetTimestamp(&baseCuptiTs);

    // Snapshot the handler list so handlers run without holding handler_mu_.
    std::vector<std::shared_ptr<ICuptiHandler>> handlers;
    {
        std::lock_guard<std::mutex> lk(backend->handler_mu_);
        handlers = backend->handlers_;
    }

    if (validSize > 0) {
        CUpti_Activity* record = nullptr;
        while (true) {
            const CUptiResult st =
                cuptiActivityGetNextRecord(buffer, validSize, &record);
            if (st == CUPTI_ERROR_MAX_LIMIT_REACHED) {
                break;  // No more records in this buffer.
            }
            if (st != CUPTI_SUCCESS) {
                ::gpufl::DebugLogger::error("[CUPTI] ",
                                            "Error parsing buffer: ", st);
                break;
            }

            bool handled = false;
            for (const auto& h : handlers) {
                if (h->handleActivityRecord(record, baseCpuNs, baseCuptiTs)) {
                    handled = true;
                    break;
                }
            }

            if (!handled && record->kind == CUPTI_ACTIVITY_KIND_PC_SAMPLING) {
                auto* pc =
                    reinterpret_cast<CUpti_ActivityPCSampling3*>(record);
                ActivityRecord out{};
                out.type = TraceType::PC_SAMPLE;
                out.corr_id = pc->correlationId;
                std::snprintf(out.sample_kind, sizeof(out.sample_kind), "%s",
                              "pc_sampling");
                out.samples_count = pc->samples;
                out.stall_reason = pc->stallReason;
                // A PC-sampling record carries no device id. Reinterpreting
                // it as a kernel record to read `deviceId` would read past
                // the end of the (much smaller) PC-sampling record. Use the
                // device id the backend resolved in start() instead.
                // NOTE(review): assumes PC-sampling records only arrive
                // after start() has populated device_id_ - confirm.
                out.device_id = backend->device_id_;
                g_monitorBuffer.Push(out);
            }
        }
    }

    free(buffer);
}
|
|
288
|
+
|
|
289
|
+
void CuptiBackend::GflCallback(void* userdata, CUpti_CallbackDomain domain,
|
|
290
|
+
CUpti_CallbackId cbid, const void* cbdata) {
|
|
291
|
+
if (!cbdata) return;
|
|
292
|
+
|
|
293
|
+
auto* backend = static_cast<CuptiBackend*>(userdata);
|
|
294
|
+
if (!backend) return;
|
|
295
|
+
|
|
296
|
+
std::vector<std::shared_ptr<ICuptiHandler>> handlers;
|
|
297
|
+
{
|
|
298
|
+
std::lock_guard<std::mutex> lk(backend->handler_mu_);
|
|
299
|
+
handlers = backend->handlers_;
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
bool apiHandled = false;
|
|
303
|
+
|
|
304
|
+
for (const auto& handler : handlers) {
|
|
305
|
+
if (handler->shouldHandle(domain, cbid)) {
|
|
306
|
+
if (domain == CUPTI_CB_DOMAIN_RUNTIME_API ||
|
|
307
|
+
domain == CUPTI_CB_DOMAIN_DRIVER_API) {
|
|
308
|
+
if (apiHandled) continue;
|
|
309
|
+
apiHandled = true;
|
|
310
|
+
}
|
|
311
|
+
handler->handle(domain, cbid, cbdata);
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
} // namespace gpufl
|