nsight-python 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nsight/__init__.py +12 -0
- nsight/analyze.py +363 -0
- nsight/annotation.py +80 -0
- nsight/collection/__init__.py +10 -0
- nsight/collection/core.py +399 -0
- nsight/collection/ncu.py +268 -0
- nsight/exceptions.py +51 -0
- nsight/extraction.py +224 -0
- nsight/thermovision.py +115 -0
- nsight/transformation.py +167 -0
- nsight/utils.py +320 -0
- nsight/visualization.py +470 -0
- nsight_python-0.9.4.dist-info/METADATA +254 -0
- nsight_python-0.9.4.dist-info/RECORD +16 -0
- nsight_python-0.9.4.dist-info/WHEEL +4 -0
- nsight_python-0.9.4.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
import abc
|
|
5
|
+
import dataclasses
|
|
6
|
+
import functools
|
|
7
|
+
import importlib.util
|
|
8
|
+
import inspect
|
|
9
|
+
import os
|
|
10
|
+
import time
|
|
11
|
+
from collections.abc import Callable, Sequence
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
from nsight import exceptions, thermovision, transformation, utils
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _sanitize_configs(
|
|
20
|
+
func: Callable[..., Any],
|
|
21
|
+
*args: Any,
|
|
22
|
+
configs: Sequence[Sequence[Any]] | None = None,
|
|
23
|
+
decorator_configs: Sequence[Sequence[Any]] | None = None,
|
|
24
|
+
**kwargs: Any,
|
|
25
|
+
) -> list[tuple[Any, ...]]:
|
|
26
|
+
"""
|
|
27
|
+
Sanitizes and validates configuration inputs for a profile-decorated function.
|
|
28
|
+
|
|
29
|
+
This function ensures that the provided configurations are consistent and
|
|
30
|
+
handles different cases of passing configurations.
|
|
31
|
+
|
|
32
|
+
1. As regular args+kw - A single configuration -> the function arguments
|
|
33
|
+
2. As configs=, a list of configurations at functionn call time
|
|
34
|
+
3. As decorator_configs=, a list of configurations at decoration time
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
*args: Positional arguments that may contain configuration data.
|
|
38
|
+
configs: A list of configurations provided at runtime.
|
|
39
|
+
decorator_configs: A list of configurations provided
|
|
40
|
+
at decoration time.
|
|
41
|
+
**kwargs: Keyword arguments that may contain additional configuration data.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
A sanitized list of configurations.
|
|
45
|
+
|
|
46
|
+
Raises:
|
|
47
|
+
exceptions.ProfilerException: If no configurations are provided or if
|
|
48
|
+
configurations are provided both at decoration time and runtime.
|
|
49
|
+
AssertionError: If `configs` is not a list when provided.
|
|
50
|
+
|
|
51
|
+
Notes:
|
|
52
|
+
- If `args` are provided, `configs` and `decorator_configs` must be `None`.
|
|
53
|
+
- If `configs` is provided, `decorator_configs` must be `None`, and vice versa.
|
|
54
|
+
- The function combines `args` and `kwargs` into a single list if `args` are provided.
|
|
55
|
+
- The function assumes that `kwargs` keys are in the expected order when combining.
|
|
56
|
+
"""
|
|
57
|
+
if len(args) > 0:
|
|
58
|
+
# We do not expect any configs in this case
|
|
59
|
+
assert configs is None and decorator_configs is None
|
|
60
|
+
# kwargs not supported here yet
|
|
61
|
+
if len(kwargs) != 0:
|
|
62
|
+
raise exceptions.ProfilerException(
|
|
63
|
+
f"Keyword arguments are not supported yet: {list(kwargs.keys())}"
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
configs = [list(args)]
|
|
67
|
+
|
|
68
|
+
if configs is None:
|
|
69
|
+
if decorator_configs is None:
|
|
70
|
+
raise exceptions.ProfilerException(
|
|
71
|
+
"You have provided no configs. Provide configs at decoration time or at runtime."
|
|
72
|
+
)
|
|
73
|
+
configs = decorator_configs
|
|
74
|
+
|
|
75
|
+
else:
|
|
76
|
+
if decorator_configs is not None:
|
|
77
|
+
raise exceptions.ProfilerException(
|
|
78
|
+
"You have provided configs at decoration time and at runtime. Provide configs at decoration time or at runtime."
|
|
79
|
+
)
|
|
80
|
+
assert isinstance(configs, list), f"configs must be a list, got {type(configs)}"
|
|
81
|
+
|
|
82
|
+
# Validate that all configs have the same number of arguments
|
|
83
|
+
if len(configs) == 0:
|
|
84
|
+
raise exceptions.ProfilerException("configs list cannot be empty")
|
|
85
|
+
config_lengths = [len(config) for config in configs]
|
|
86
|
+
if not all(length == config_lengths[0] for length in config_lengths):
|
|
87
|
+
raise exceptions.ProfilerException(
|
|
88
|
+
f"All configs must have the same number of arguments. Found lengths: {config_lengths}"
|
|
89
|
+
)
|
|
90
|
+
first_config_arg_count = config_lengths[0]
|
|
91
|
+
|
|
92
|
+
# Validate that the number of args matches the number of function parameters
|
|
93
|
+
sig = inspect.signature(func)
|
|
94
|
+
expected_arg_count = len(sig.parameters)
|
|
95
|
+
if first_config_arg_count != expected_arg_count:
|
|
96
|
+
raise exceptions.ProfilerException(
|
|
97
|
+
f"Configs have {first_config_arg_count} arguments, but function expects {expected_arg_count}"
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
return configs # type: ignore[return-value]
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def run_profile_session(
    func: Callable[..., Any],
    configs: Sequence[Sequence[Any]],
    runs: int,
    output_progress: bool,
    output_detailed: bool,
    thermal_control: bool,
) -> None:
    """
    Execute ``func`` once per (config, run) pair, optionally showing progress.

    Args:
        func: The function to execute; must accept exactly as many
            positional arguments as each config provides.
        configs: Configurations to run; each is unpacked as ``func(*config)``.
        runs: Number of repetitions per configuration.
        output_progress: If True, print a config header and a progress bar.
        output_detailed: If True, progress output is not overwritten in place.
        thermal_control: If True, initialize thermovision and guard against
            GPU thermal throttling before each run.

    Raises:
        exceptions.ProfilerException: If a config's length does not match the
            function's parameter count.
    """
    if output_progress:
        print("")
        print("")

    if thermal_control:
        thermovision_initialized = thermovision.init()

    total_configs = len(configs)
    total_runs = total_configs * runs  # Total runs executed
    curr_config = 0
    curr_run = 0
    # Accumulated time of runs 2..N; the first run is treated as warmup.
    total_time: float = 0
    bar_length = 100
    progress_time: float = 0
    # Initialized up front so the final progress-bar update cannot hit an
    # unbound local when `configs` is empty or `runs` is 0.
    avg_time_per_run: float = 0.0

    # overwrite flag: we do not overwrite when output mode is detailed
    overwrite_output = not output_detailed

    for c in configs:
        curr_config += 1

        if output_progress:
            utils.print_config(total_configs, curr_config, c, overwrite_output)

        for _ in range(runs):
            start_time = time.time()
            curr_run += 1
            if thermal_control and thermovision_initialized:
                thermovision.throttle_guard()

            # Check if func supports the input configs
            if len(inspect.signature(func).parameters) != len(c):
                raise exceptions.ProfilerException(
                    f"Function '{func.__name__}' does not support the input configuration"
                )

            # Run the function with the config
            func(*c)

            elapsed_time = time.time() - start_time
            if curr_run > 1:
                total_time += elapsed_time
                # Divide by the number of *timed* runs (first run excluded
                # from total_time), not the total run count — dividing by
                # curr_run would systematically underestimate the average.
                avg_time_per_run = total_time / (curr_run - 1)
            else:
                avg_time_per_run = elapsed_time  # Use first run's time only

            # Update time estimates every half second
            if time.time() - progress_time > 0.5:
                if output_progress:
                    utils.print_progress_bar(
                        total_runs,
                        curr_run,
                        bar_length,
                        avg_time_per_run,
                        overwrite_output,
                    )
                progress_time = time.time()

    # Update progress bar at end so it shows 100%
    if output_progress:
        utils.print_progress_bar(
            total_runs,
            curr_run,
            bar_length,
            avg_time_per_run,
            overwrite_output,
        )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@dataclasses.dataclass
class ProfileSettings:
    """
    Class to hold profile settings for Nsight Python.

    Groups every option a profiling run needs: which configurations to
    execute and how often, what progress/CSV output to produce, how to
    post-process the collected metric, and whether thermal control is
    applied. Instances are consumed by the collector and by the
    profiling decorator (``NsightProfiler``).
    """

    configs: Sequence[Sequence[Any]] | None
    """
    A list of configurations to run the
    function with. Each configuration is a tuple of arguments for the
    decorated function. Nsight Python invokes the decorated function
    ``len(configs) * runs`` times. If the configs are not provided at
    decoration time, they must be provided when calling the decorated function.
    """

    runs: int
    """Number of times each configuration should be executed."""

    output_progress: bool
    """
    Will display a progress bar on stdout during profiling
    """

    output_detailed: bool
    """
    Will display a progress bar, detailed output for each config along with the profiler logs
    """

    derive_metric: Callable[..., float] | None
    """
    A function to transform the collected metric.
    This can be used to compute derived metrics like TFLOPs that cannot
    be captured by ncu directly. The function takes the metric value and
    the arguments of the profile-decorated function and returns the new
    metric. See the examples for concrete use cases.
    """

    normalize_against: str | None
    """
    Annotation name to normalize metrics against.
    This is useful to compute relative metrics like speedup.
    """

    thermal_control: bool
    """
    Toggles whether to enable thermal control.
    """

    output_prefix: str | None
    """
    All intermediate profiler files are created with this prefix
    """

    output_csv: bool
    """
    Controls whether to output raw and processed profiling data to CSV files
    """
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
class ProfileResults:
    """
    Container for the processed profiling results of a Nsight Python run.
    """

    def __init__(self, results: pd.DataFrame):
        """
        Wrap the processed profiling results.

        Args:
            results: Processed profiling results.
        """
        self._df = results

    def to_dataframe(self) -> pd.DataFrame:
        """
        Returns the processed profiling data as a pandas DataFrame.

        This DataFrame contains aggregated statistics across multiple runs for each
        configuration and annotation combination. The data is equivalent to what is
        written to the ``processed_data-<function_name>-<run_id>.csv`` file when
        ``output_csv=True``.

        Returns:
            Processed profiling data with the following columns:

            - ``Annotation``: Name of the annotated region being profiled
            - ``<param_name>``: One column for each parameter of the decorated function
            - ``AvgValue``: Average metric value across all runs
            - ``StdDev``: Standard deviation of the metric across runs
            - ``MinValue``: Minimum metric value observed
            - ``MaxValue``: Maximum metric value observed
            - ``NumRuns``: Number of runs used for aggregation
            - ``CI95_Lower``: Lower bound of the 95% confidence interval
            - ``CI95_Upper``: Upper bound of the 95% confidence interval
            - ``RelativeStdDevPct``: Standard deviation as a percentage of the mean
            - ``StableMeasurement``: Boolean indicating if the measurement is stable (low variance). The measurement is stable if ``RelativeStdDevPct`` < 2 % .
            - ``Metric``: The metric being collected
            - ``Transformed``: Name of the function used to transform the metric (specified via ``derive_metric``), or ``False`` if no transformation was applied. For lambda functions, this shows ``"<lambda>"``
            - ``Kernel``: Name of the GPU kernel(s) launched
            - ``GPU``: GPU device name
            - ``Host``: Host machine name
            - ``ComputeClock``: GPU compute clock frequency
            - ``MemoryClock``: GPU memory clock frequency
        """
        return self._df
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class NsightCollector(abc.ABC):
    """Abstract interface implemented by Nsight Python data collectors."""

    @abc.abstractmethod
    def collect(
        self,
        func: Callable[..., Any],
        configs: Sequence[Sequence[Any]],
        settings: ProfileSettings,
    ) -> pd.DataFrame | None:
        """
        Gather raw profiling data for ``func`` across ``configs``.

        Args:
            func: The function to be profiled.
            configs: List of configurations for profiling.
            settings: Settings for profiling.

        Returns:
            Collected profiling data.
        """
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
class NsightProfiler:
    """
    A decorator class for profiling functions using Nsight Python's profiling tools.

    This class allows you to wrap a function with profiling capabilities,
    collecting performance data and saving the results to CSV files. It uses
    a collector to gather raw profiling data and processes it according to
    the provided settings.

    Attributes:
        settings: Configuration settings for profiling,
            including normalization, and other options.
        collector: The collector responsible for gathering
            raw profiling data.

    Methods:
        __call__(func):
            Wraps the given function with profiling logic. Collects raw
            profiling data, processes it, and saves the results to CSV files.
            Returns the processed data.
    """

    def __init__(self, settings: ProfileSettings, collector: NsightCollector):
        # settings: options applied to every call of the decorated function.
        # collector: backend that produces the raw profiling DataFrame.
        self.settings = settings
        self.collector = collector

    def __call__(
        self, func: Callable[..., Any]
    ) -> Callable[..., ProfileResults | None]:
        # Per-function run counter, stored on the function object itself so
        # repeated invocations of the same decorated function produce
        # distinct output-file tags.
        func._nspy_ncu_run_id = 0  # type: ignore[attr-defined]

        @functools.wraps(func)
        def wrapper(
            *args: Any, configs: Sequence[Sequence[Any]] | None = None, **kwargs: Any
        ) -> ProfileResults | None:

            # Tag identifying this run in all output file names.
            tag = f"{func.__name__}-{func._nspy_ncu_run_id}"  # type: ignore[attr-defined]

            # Reconcile the three possible config sources (direct args,
            # runtime configs=, decoration-time configs) into one list.
            configs = _sanitize_configs(
                func,
                *args,
                configs=configs,
                decorator_configs=self.settings.configs,
                **kwargs,
            )

            raw_df = self.collector.collect(func, configs, self.settings)

            # NOTE(review): collectors may legitimately return None (e.g. the
            # NCU collector returns None in the re-launched child process for
            # functions it is not profiling); in that case no results are
            # produced and the run counter is not advanced.
            if raw_df is not None:
                # Aggregate per-run measurements into summary statistics.
                processed = transformation.aggregate_data(
                    raw_df,
                    func,
                    self.settings.normalize_against,
                    self.settings.output_progress,
                )

                # Save to CSV if enabled
                if self.settings.output_csv:
                    raw_csv_path = (
                        f"{self.settings.output_prefix}profiled_data-{tag}.csv"
                    )
                    processed_csv_path = (
                        f"{self.settings.output_prefix}processed_data-{tag}.csv"
                    )

                    raw_df.to_csv(
                        raw_csv_path,
                        index=False,
                    )
                    processed.to_csv(
                        processed_csv_path,
                        index=False,
                    )

                    if self.settings.output_progress:
                        print(
                            f"[NSIGHT-PYTHON] Refer to {raw_csv_path} for the raw profiling data"
                        )
                        print(
                            f"[NSIGHT-PYTHON] Refer to {processed_csv_path} for the processed profiling data"
                        )

                # Advance the counter only after a successful collection so
                # the next call gets a fresh tag.
                func._nspy_ncu_run_id += 1  # type: ignore[attr-defined]

                return ProfileResults(results=processed)

            return None

        return wrapper
|
nsight/collection/ncu.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Collection utilities for profiling Nsight Python runs using NVIDIA Nsight Compute (ncu).
|
|
6
|
+
|
|
7
|
+
This module contains logic for launching NVIDIA Nsight Compute with appropriate settings.
|
|
8
|
+
NCU is instructed to profile specific code sections marked by NVTX ranges - the
|
|
9
|
+
Nsight Python annotations.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import subprocess
|
|
14
|
+
import sys
|
|
15
|
+
from collections.abc import Callable, Sequence
|
|
16
|
+
from typing import Any, Literal
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from nsight import exceptions, extraction, utils
|
|
21
|
+
from nsight.collection import core
|
|
22
|
+
from nsight.exceptions import NCUErrorContext
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def launch_ncu(
|
|
26
|
+
report_path: str,
|
|
27
|
+
name: str,
|
|
28
|
+
metric: str,
|
|
29
|
+
cache_control: Literal["none", "all"],
|
|
30
|
+
clock_control: Literal["none", "base"],
|
|
31
|
+
replay_mode: Literal["kernel", "range"],
|
|
32
|
+
verbose: bool,
|
|
33
|
+
) -> str | None:
|
|
34
|
+
"""
|
|
35
|
+
Launch NVIDIA Nsight Compute to profile the current script with specified options.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
report_path: Path to write report file to.
|
|
39
|
+
metric: Specific metric to collect.
|
|
40
|
+
cache_control: Select cache control option
|
|
41
|
+
clock_control: Select clock control option
|
|
42
|
+
replay_mode: Select replay mode option
|
|
43
|
+
verbose: If False, log is written to a file (ncu_log.txt)
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
NCUNotAvailableError: If NCU is not available on the system.
|
|
47
|
+
SystemExit: If profiling fails due to an error from NVIDIA Nsight Compute.
|
|
48
|
+
|
|
49
|
+
Returns:
|
|
50
|
+
path to the NVIDIA Nsight Compute log file
|
|
51
|
+
Produces NVIDIA Nsight Compute report file with profiling data.
|
|
52
|
+
"""
|
|
53
|
+
assert report_path.endswith(".ncu-rep")
|
|
54
|
+
|
|
55
|
+
# Determine the script being executed
|
|
56
|
+
script_path = os.path.abspath(sys.argv[0])
|
|
57
|
+
script_args = " ".join(sys.argv[1:])
|
|
58
|
+
|
|
59
|
+
# Set an environment variable to detect recursive calls
|
|
60
|
+
env = os.environ.copy()
|
|
61
|
+
env["NSPY_NCU_PROFILE"] = name
|
|
62
|
+
|
|
63
|
+
if cache_control not in ("none", "all"):
|
|
64
|
+
raise ValueError("cache_control must be 'none', or 'all'")
|
|
65
|
+
if clock_control not in ("none", "base"):
|
|
66
|
+
raise ValueError("clock_control must be 'none', or 'base'")
|
|
67
|
+
if replay_mode not in ("kernel", "range"):
|
|
68
|
+
raise ValueError("replay_mode must be 'kernel', or 'range'")
|
|
69
|
+
|
|
70
|
+
cache = f"--cache-control {cache_control}"
|
|
71
|
+
clocks = f"--clock-control {clock_control}"
|
|
72
|
+
replay = f"--replay-mode {replay_mode}"
|
|
73
|
+
log_path = os.path.splitext(report_path)[0] + ".log"
|
|
74
|
+
log = f"--log-file {log_path}"
|
|
75
|
+
nvtx = f'--nvtx --nvtx-include "regex:{utils.NVTX_DOMAIN}@.+/"'
|
|
76
|
+
|
|
77
|
+
# Construct the ncu command
|
|
78
|
+
ncu_command = f"""ncu {log} {cache} {clocks} {replay} {nvtx} --metrics {metric} -f -o {report_path} {sys.executable} {script_path} {script_args}"""
|
|
79
|
+
|
|
80
|
+
# Check if ncu is available on the system
|
|
81
|
+
ncu_available = False
|
|
82
|
+
try:
|
|
83
|
+
subprocess.run(
|
|
84
|
+
["ncu", "--version"],
|
|
85
|
+
check=True,
|
|
86
|
+
stdout=subprocess.DEVNULL,
|
|
87
|
+
stderr=subprocess.DEVNULL,
|
|
88
|
+
)
|
|
89
|
+
ncu_available = True
|
|
90
|
+
except:
|
|
91
|
+
ncu_available = False
|
|
92
|
+
|
|
93
|
+
if ncu_available:
|
|
94
|
+
try:
|
|
95
|
+
subprocess.run(
|
|
96
|
+
ncu_command,
|
|
97
|
+
shell=True,
|
|
98
|
+
check=True,
|
|
99
|
+
env=env,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
return log_path
|
|
103
|
+
except subprocess.CalledProcessError as e:
|
|
104
|
+
log_parser = utils.NCULogParser()
|
|
105
|
+
error_logs = log_parser.get_logs(log_path, "ERROR")
|
|
106
|
+
|
|
107
|
+
# Create error context
|
|
108
|
+
error_context = NCUErrorContext(
|
|
109
|
+
errors=error_logs,
|
|
110
|
+
log_file_path=log_path,
|
|
111
|
+
metric=metric,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
error_message = utils.format_ncu_error_message(error_context)
|
|
115
|
+
print(error_message)
|
|
116
|
+
sys.exit(1)
|
|
117
|
+
else:
|
|
118
|
+
subprocess.run([sys.executable, script_path], env=env)
|
|
119
|
+
raise exceptions.NCUNotAvailableError(
|
|
120
|
+
"Nsight Compute CLI (ncu) is not available on this system. Profiling will not be performed.\n"
|
|
121
|
+
"Please install Nsight Compute CLI."
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class NCUCollector(core.NsightCollector):
    """
    NCU collector for Nsight Python.

    Args:
        metric: Metric to collect from
            NVIDIA Nsight Compute. By default we collect kernel runtimes in nanoseconds.
            A list of supported metrics can be found with ``ncu --list-metrics``.
        ignore_kernel_list: List of kernel names to ignore.
            If you call a library within a ``annotation`` context, you might not have
            precise control over which and how many kernels are being launched.
            If some of these kernels should be ignored in the Nsight Python profile,
            their names can be blacklisted. Default: ``None``
        combine_kernel_metrics: By default, Nsight Python
            expects one kernel launch per annotation. In case an annotated region launches
            multiple kernels, instead of failing the profiling run, you can specify
            how to summarize the collected metrics into a single number. For example,
            if we profile runtime and want to sum the times of all kernels we can specify
            ``combine_kernel_metrics = lambda x, y: x + y``. The function should take
            two arguments and return a single value. Default: ``None``.
        clock_control: Select clock_control option
            control in NVIDIA Nsight Compute. If ``None``, we launch ``ncu --clock-control none ...``.
            For more details, see the NVIDIA Nsight Compute Profiling Guide:
            https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#clock-control
            Default: ``None``
        cache_control: Select cache_control option
            control in NVIDIA Nsight Compute. If ``None``, we launch ``ncu --cache-control none ...``.
            For more details, see the NVIDIA Nsight Compute Profiling Guide:
            https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#cache-control
            Default: ``all``
        replay_mode: Select replay mode option
            control in NVIDIA Nsight Compute. If ``None``, we launch ``ncu --replay-mode kernel ...``.
            For more details, see the NVIDIA Nsight Compute Profiling Guide:
            https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#replay
            Default: ``kernel``
    """

    def __init__(
        self,
        metric: str = "gpu__time_duration.sum",
        ignore_kernel_list: Sequence[str] | None = None,
        combine_kernel_metrics: Callable[[float, float], float] | None = None,
        clock_control: Literal["base", "none"] = "none",
        cache_control: Literal["all", "none"] = "all",
        replay_mode: Literal["kernel", "range"] = "kernel",
    ):
        # Validate eagerly so misconfiguration fails at construction time,
        # not when ncu is launched.
        if clock_control not in ("none", "base"):
            raise ValueError("clock_control must be 'none', or 'base'")
        if cache_control not in ("none", "all"):
            raise ValueError("cache_control must be 'none', or 'all'")
        if replay_mode not in ("kernel", "range"):
            raise ValueError("replay_mode must be 'kernel', or 'range'")

        self.metric = metric
        self.ignore_kernel_list = ignore_kernel_list or []
        self.combine_kernel_metrics = combine_kernel_metrics
        self.clock_control = clock_control
        self.cache_control = cache_control
        self.replay_mode = replay_mode

    def collect(
        self,
        func: Callable[..., Any],
        configs: Sequence[Sequence[Any]],
        settings: core.ProfileSettings,
    ) -> pd.DataFrame | None:
        """
        Collects profiling data using NVIDIA Nsight Compute.

        Runs in one of two modes, selected by the ``NSPY_NCU_PROFILE``
        environment variable:

        - Parent process (variable unset): re-launches the current script
          under ``ncu`` via :func:`launch_ncu`, then extracts the collected
          data from the generated report and returns it as a DataFrame.
        - Child process (variable set by the parent): executes the function
          being profiled directly so ncu can observe it, then terminates
          the process; this call never returns in that mode.

        Args:
            func: The function to profile.
            configs: List of configurations to run the function with.
            settings: Profiling settings.

        Returns:
            Collected profiling data, or ``None`` when running in the child
            process for a function other than the one being profiled.
        """

        # Check if the script is already running under ncu
        if "NSPY_NCU_PROFILE" not in os.environ:

            # Tag mirrors the run counter set by the profiling decorator so
            # report/log names are unique per invocation.
            tag = f"{func.__name__}-{func._nspy_ncu_run_id}"  # type: ignore[attr-defined]
            report_path = f"{settings.output_prefix}ncu-output-{tag}.ncu-rep"

            # Launch NVIDIA Nsight Compute
            log_path = launch_ncu(
                report_path,
                func.__name__,
                self.metric,
                self.cache_control,
                self.clock_control,
                self.replay_mode,
                settings.output_detailed,
            )

            if settings.output_progress:
                print("[NSIGHT-PYTHON] Profiling completed successfully !")
                print(
                    f"[NSIGHT-PYTHON] Refer to {report_path} for the NVIDIA Nsight Compute CLI report"
                )
                print(
                    f"[NSIGHT-PYTHON] Refer to {log_path} for the NVIDIA Nsight Compute CLI logs"
                )

            # Extract raw data
            df = extraction.extract_df_from_report(
                report_path,
                self.metric,
                configs,  # type: ignore[arg-type]
                settings.runs,
                func,
                settings.derive_metric,
                self.ignore_kernel_list,  # type: ignore[arg-type]
                settings.output_progress,
                self.combine_kernel_metrics,
            )
            return df

        else:
            # If NSPY_NCU_PROFILE is set, just run the function normally
            name = os.environ["NSPY_NCU_PROFILE"]

            # If this is not the function we are profiling, stop
            if func.__name__ != name:
                return None

            if settings.output_progress:
                utils.print_header(
                    f"Profiling {name}",
                    f"{len(configs)} configurations, {settings.runs} runs each",
                )

            core.run_profile_session(
                func,
                configs,
                settings.runs,
                settings.output_progress,
                settings.output_detailed,
                settings.thermal_control,
            )

            # Exit after profiling to prevent the rest of the script from running
            # Use os._exit() instead of sys.exit() to avoid pytest catching SystemExit
            os._exit(0)
|