nsight-python 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nsight/__init__.py +12 -0
- nsight/analyze.py +363 -0
- nsight/annotation.py +80 -0
- nsight/collection/__init__.py +10 -0
- nsight/collection/core.py +399 -0
- nsight/collection/ncu.py +268 -0
- nsight/exceptions.py +51 -0
- nsight/extraction.py +224 -0
- nsight/thermovision.py +115 -0
- nsight/transformation.py +167 -0
- nsight/utils.py +320 -0
- nsight/visualization.py +470 -0
- nsight_python-0.9.4.dist-info/METADATA +254 -0
- nsight_python-0.9.4.dist-info/RECORD +16 -0
- nsight_python-0.9.4.dist-info/WHEEL +4 -0
- nsight_python-0.9.4.dist-info/licenses/LICENSE +202 -0
nsight/utils.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
import functools
|
|
5
|
+
import re
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from itertools import islice
|
|
10
|
+
from typing import Any, Iterator
|
|
11
|
+
|
|
12
|
+
from nsight.exceptions import CUDA_CORE_UNAVAILABLE_MSG, NCUErrorContext
|
|
13
|
+
|
|
14
|
+
# Try to import cuda-core (optional dependency)
try:
    from cuda.core.experimental import (
        Device,
        LaunchConfig,
        Program,
        ProgramOptions,
        launch,
    )

    CUDA_CORE_AVAILABLE = True
except ImportError:
    # cuda-core is not installed: export None sentinels so importing this
    # module still succeeds. Callers must check CUDA_CORE_AVAILABLE before
    # touching any of these names.
    CUDA_CORE_AVAILABLE = False
    Device = None
    LaunchConfig = None
    Program = None
    ProgramOptions = None
    launch = None

# NVTX domain name used by this package (presumably for annotation ranges —
# confirm against nsight.annotation usage).
NVTX_DOMAIN = "nsight-python"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class row_panel:
    """Marker class with no behavior.

    NOTE(review): presumably used as a sentinel selecting row-wise panel
    layout in the visualization code — confirm against callers.
    """

    pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class col_panel:
    """Marker class with no behavior.

    NOTE(review): presumably used as a sentinel selecting column-wise panel
    layout in the visualization code — confirm against callers.
    """

    pass
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class Colors:
    """ANSI terminal escape sequences for colorful printing."""

    # Foreground colors.
    HEADER = "\033[95m"
    BLUE = "\033[0;34m"
    CYAN = "\033[0;36m"
    GREEN = "\033[0;32m"
    ORANGE = "\033[0;33m"
    RED = "\033[0;31m"
    PURPLE = "\033[0;35m"
    # Reset all attributes back to the terminal default.
    ENDC = "\033[0m"
    # Text-style modifiers.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def purple(msg: str) -> str:  # pragma: no cover
    """Return ``msg`` wrapped in ANSI escape codes that render it purple."""
    return f"{Colors.PURPLE}{msg}{Colors.ENDC}"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ------------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@functools.lru_cache
def get_dummy_kernel_module() -> Any:
    """
    Returns a dummy kernel that does nothing. In case a provider fails for some reason, but we
    want to keep benchmarking, we launch this dummy kernel such that during our later analysis
    of the ncu-report we still find the expected number of measured kernels per provider.

    The measured runtime of this kernel is ignored and the final result of the failed run will be
    reported as NaN.

    Raises:
        ImportError: If cuda-core is not installed.
    """
    if not CUDA_CORE_AVAILABLE:
        raise ImportError(CUDA_CORE_UNAVAILABLE_MSG)
    # Compile a no-op kernel once; lru_cache makes subsequent calls free.
    code = "__global__ void dummy_kernel_failure() {}"
    program_options = ProgramOptions(std="c++17")
    prog = Program(code, code_type="c++", options=program_options)
    return prog.compile("cubin", name_expressions=("dummy_kernel_failure",))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def launch_dummy_kernel_module() -> None:
    """
    Launch a dummy kernel module.

    Launches the cached no-op kernel on the current device and waits for it
    to complete, so a failed provider run still contributes one measurable
    kernel launch to the profile.

    Raises:
        ImportError: If cuda-core is not installed.
    """
    if not CUDA_CORE_AVAILABLE:
        raise ImportError(CUDA_CORE_UNAVAILABLE_MSG)
    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    mod = get_dummy_kernel_module()
    kernel = mod.get_kernel("dummy_kernel_failure")
    # Single block of 256 threads; the kernel body is empty, so the launch
    # geometry is irrelevant beyond being valid.
    config = LaunchConfig(grid=1, block=256)
    launch(stream, config, kernel)
    # Block until the kernel has finished so the profiler captures it.
    stream.sync()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def format_time(seconds: float) -> str:
    """Format a duration given in seconds as an ``HH:MM:SS`` string."""
    total = int(seconds)
    hours = total // 3600
    minutes = (total % 3600) // 60
    secs = total % 60
    return f"{hours:02}:{minutes:02}:{secs:02}"
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# Sincerely stolen (and adjusted) from attention-gym
|
|
115
|
+
def print_header(*lines: str) -> None:
    """Print ``lines`` centered inside a purple box at least 80 columns wide."""
    width = max(80, max(len(line) for line in lines) + 4)
    top = "╔" + "═" * (width - 2) + "╗"
    bottom = "╚" + "═" * (width - 2) + "╝"
    print(purple(top))
    for line in lines:
        print(purple("║ " + line.center(width - 4) + " ║"))
    print(purple(bottom))
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@dataclass
class NCUActionData:
    """Data recorded for a single profiled NCU action.

    Holds the action name, its measured value, and the clock/GPU context the
    measurement was taken under.
    """

    name: str
    value: Any
    compute_clock: int
    memory_clock: int
    gpu: str

    @staticmethod
    def combine(value_reduce_op: Any) -> Any:
        """
        Return a binary function that merges two NCUActionData objects by
        applying ``value_reduce_op`` to their values.
        """

        def _combine(first: "NCUActionData", second: "NCUActionData") -> "NCUActionData":
            # Merging only makes sense for measurements taken under identical
            # clock settings on the same GPU.
            assert first.compute_clock == second.compute_clock
            assert first.memory_clock == second.memory_clock
            assert first.gpu == second.gpu
            merged_value = value_reduce_op(first.value, second.value)
            return NCUActionData(
                name=f"{first.name}|{second.name}",
                value=merged_value,
                compute_clock=first.compute_clock,
                memory_clock=first.memory_clock,
                gpu=first.gpu,
            )

        return _combine
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def print_progress_bar(
    total_runs: int,
    curr_run: int,
    bar_length: int,
    avg_time_per_run: float,
    overwrite_output: bool,
) -> None:
    """
    Render a textual progress bar with an estimated time remaining.

    Args:
        total_runs: Total number of runs to execute.
        curr_run: Current run index.
        bar_length: Length of the progress bar in characters.
        avg_time_per_run: Average time taken per run, used to estimate remaining time.
        overwrite_output: Controls how the bar is printed:
            - **True**: Overwrites the existing progress bar in-place
            - **False**: Writes a new progress bar on its own line
    """
    eta = format_time(avg_time_per_run * (total_runs - curr_run))
    fraction = curr_run / total_runs
    filled = int(bar_length * fraction)
    bar = "█" * filled + "-" * (bar_length - filled)
    message = f"Progress: [{bar}] {fraction * 100:.2f}% | Estimated time remaining: {eta}"

    if overwrite_output:
        # Move the cursor up one line and clear it before rewriting the bar.
        sys.stdout.write("\033[1A")
        sys.stdout.write("\033[2K\r")
        sys.stdout.write(message + "\n")
        sys.stdout.flush()
    else:
        print(message)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def print_config(
    total_configs: int, curr_config: int, c: Any, overwrite_output: bool
) -> None:
    """
    Print the configuration currently being profiled.

    Args:
        total_configs: Total number of configurations.
        curr_config: Current configuration index.
        c: The current configuration parameters.
        overwrite_output: Controls how configurations are printed:
            - **True**: The configuration is updated in-place
            - **False**: Each configuration is printed on a new line
    """
    label = f"Config {curr_config}/{total_configs}: {str(list(map(str, c)))}"
    if overwrite_output:
        # Move the cursor to the start of the line two rows up, clear it,
        # then rewrite the config followed by a blank line.
        sys.stdout.write("\033[2F")
        sys.stdout.write("\033[2K\r")
        sys.stdout.write(label + "\n\n")
        sys.stdout.flush()
    else:
        print_header(label)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def batched(iterable: Any, n: int) -> Iterator[tuple[Any, ...]]:
    """
    Batch an iterable into tuples of size at most ``n``.

    This is a minimal backport of itertools.batched for Python 3.10 and 3.11,
    where the standard library implementation is not available.

    Args:
        iterable: Source of items to batch.
        n: Maximum batch size; the final batch may be shorter.

    Raises:
        ValueError: If ``n`` is less than 1. Raised eagerly at call time,
            matching ``itertools.batched`` (a check inside the generator body
            would only fire once iteration starts).
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    return _batched_generate(iterable, n)


def _batched_generate(iterable: Any, n: int) -> Iterator[tuple[Any, ...]]:
    # Helper generator: repeatedly take up to n items until exhausted.
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        yield batch
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
class LogParser:
    """
    Base class for parsing the log files.
    """

    def parse_logs(self, log_file_path: str) -> dict[str, list[str]]:
        """
        Parse the log file and return entries grouped by category.

        The base implementation recognizes no categories and returns an
        empty mapping; subclasses override this.

        Args:
            log_file_path: Path to the log file.
        """
        return {}


class NCULogParser(LogParser):
    """
    Parse NCU log file.
    """

    def parse_logs(self, log_file_path: str) -> dict[str, list[str]]:
        """
        Parses the NCU log file and returns a dictionary of log entries categorized by their type.

        Args:
            log_file_path: Path to the NCU log file.
        """
        # One regex per recognized ==CATEGORY== prefix; the first matching
        # category claims the line.
        patterns = {
            "ERROR": re.compile(r"^==ERROR==\s+(.*)$"),
            "PROF": re.compile(r"^==PROF==\s+(.*)$"),
            "WARNING": re.compile(r"^==WARNING==\s+(.*)$"),
        }
        entries: dict[str, list[str]] = {category: [] for category in patterns}

        with open(log_file_path, "r") as log_file:
            for raw_line in log_file:
                stripped = raw_line.strip()
                for category, pattern in patterns.items():
                    match = pattern.match(stripped)
                    if match:
                        entries[category].append(match.group(1))
                        break

        return entries

    def get_logs(self, log_file_path: str, category: str) -> list[str]:
        """
        Returns log entries of a specific category from the NCU log file.

        Args:
            log_file_path: Path to the NCU log file.
            category: Category of logs (e.g., "ERROR", "PROF").
        """
        return self.parse_logs(log_file_path).get(category, [])
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def format_ncu_error_message(context: NCUErrorContext) -> str:
    """
    Format NCU error context into user-friendly error message.

    Args:
        context: The error context containing all relevant information.
    """

    INVALID_METRIC_ERROR_HINT = "Failed to find metric"

    # FIXME: To support multiple metrics in future, parse error message itself to extract the invalid metric name and display appropriate messages.
    parts = ["PROFILING FAILED \nErrors:"]

    # Only the first error is inspected for the invalid-metric hint; all
    # other cases fall back to a bullet list of the raw errors.
    if context.errors and INVALID_METRIC_ERROR_HINT in context.errors[0]:
        parts.append(
            f"Invalid value '{context.metric}' for 'metric' parameter for nsight.analyze.kernel(). "
            f"\nPlease refer ncu --query-metrics for list of supported metrics."
        )
    else:
        bullet_list = "\n".join(f"- {error}" for error in context.errors)
        parts.append(bullet_list)

    parts.append(
        f"\nRefer Nsight Compute CLI log file: {context.log_file_path} for more details."
    )

    return "\n".join(parts)
|