nsight-python 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nsight/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from nsight import analyze
5
+ from nsight.annotation import annotate
6
+ from nsight.utils import col_panel, row_panel
7
+
8
+ # Versioning Scheme: major.minor.build
9
+ __version__ = "0.9.4"
10
+
11
+
12
+ __all__ = ["analyze", "annotate"]
nsight/analyze.py ADDED
@@ -0,0 +1,363 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import contextlib
5
+ import functools
6
+ import os
7
+ import tempfile
8
+ from collections.abc import Callable, Sequence
9
+ from typing import Any, Literal, overload
10
+
11
+ import matplotlib
12
+ import matplotlib.figure
13
+
14
+ import nsight.collection as collection
15
+ import nsight.visualization as visualization
16
+
17
+
18
+ # Overload 1: When used without parentheses: @kernel
19
@overload
def kernel(
    _func: Callable[..., Any],
) -> Callable[..., collection.core.ProfileResults]: ...


# Overload 2: parenthesized form, i.e. @kernel() or @kernel(configs=..., runs=...)
@overload
def kernel(
    _func: None = None,
    *,
    configs: Sequence[Sequence[Any]] | None = None,
    runs: int = 1,
    derive_metric: Callable[..., float] | None = None,
    normalize_against: str | None = None,
    output: Literal["quiet", "progress", "verbose"] = "progress",
    metric: str = "gpu__time_duration.sum",
    ignore_kernel_list: Sequence[str] | None = None,
    clock_control: Literal["base", "none"] = "none",
    cache_control: Literal["all", "none"] = "all",
    replay_mode: Literal["kernel", "range"] = "kernel",
    thermal_control: bool = True,
    combine_kernel_metrics: Callable[[float, float], float] | None = None,
    output_prefix: str | None = None,
    output_csv: bool = False,
) -> Callable[[Callable[..., Any]], Callable[..., collection.core.ProfileResults]]: ...


# Runtime implementation backing both overloads
def kernel(
    _func: Callable[..., Any] | None = None,
    *,
    configs: Sequence[Sequence[Any]] | None = None,
    runs: int = 1,
    derive_metric: Callable[..., float] | None = None,
    normalize_against: str | None = None,
    output: Literal["quiet", "progress", "verbose"] = "progress",
    metric: str = "gpu__time_duration.sum",
    ignore_kernel_list: Sequence[str] | None = None,
    clock_control: Literal["base", "none"] = "none",
    cache_control: Literal["all", "none"] = "all",
    replay_mode: Literal["kernel", "range"] = "kernel",
    thermal_control: bool = True,
    combine_kernel_metrics: Callable[[float, float], float] | None = None,
    output_prefix: str | None = None,
    output_csv: bool = False,
) -> (
    Callable[..., collection.core.ProfileResults]
    | Callable[[Callable[..., Any]], Callable[..., collection.core.ProfileResults]]
):
    """
    Decorator that profiles the decorated function with NVIDIA Nsight Compute.

    All three spellings are accepted:
        - ``@nsight.analyze.kernel`` (bare, no parentheses)
        - ``@nsight.analyze.kernel()`` (empty parentheses)
        - ``@nsight.analyze.kernel(configs=..., runs=10)`` (with arguments)

    The function produced by the decorator has the signature::

        def wrapped_function(*args, configs=None, **kwargs) -> ProfileResults

    Where:
        - ``*args``: Arguments of the original function (used when a single config is supplied)
        - ``configs``: Optional list of configurations; takes precedence over the decorator-time configs
        - ``**kwargs``: Keyword arguments of the original function
        - The return value is a ``ProfileResults`` object holding the profiling data


    Parameters:
        configs:
            Sequence of configurations to run the function with. Every configuration is a tuple of
            arguments for the decorated function.
            Nsight Python calls the decorated function ``len(configs) * runs`` times.
            Configs that are not given at decoration time must be given when the decorated function is called.
        runs: How many times each configuration is executed.
        derive_metric:
            Function that transforms the collected metric.
            Use it to compute derived metrics such as TFLOPs that ncu cannot
            capture directly. It receives the metric value together with the
            arguments of the profile-decorated function and returns the new
            metric. The examples show concrete use cases.
        normalize_against:
            Name of the annotation to normalize metrics against.
            Useful for relative metrics such as speedup.
        metric: Metric to collect. Kernel runtimes in nanoseconds are collected by default. Default: ``"gpu__time_duration.sum"``. Run ``ncu --query-metrics`` to list the metrics available on your system.
        ignore_kernel_list:
            Kernel names to ignore. When a library is called inside an annotated range context, you may not
            control precisely which and how many kernels get launched.
            Kernels that should be left out of the profile can be named here. Default: ``None``
        combine_kernel_metrics: Nsight Python expects one kernel launch per
            annotation by default. If an annotated region launches several
            kernels, this function tells the profiler how to fold the collected
            metrics into a single number instead of failing the profiling run.
            For instance, to sum the runtimes of all kernels pass
            ``combine_kernel_metrics = lambda x, y: x + y``. The function takes
            two arguments and returns a single value. Default: ``None``.
        clock_control: Controls the GPU clocks while profiling. Allowed values:

            - ``"base"``: GPC and memory clocks are locked to their respective base frequency during profiling. This has no impact on thermal throttling. Note that actual clocks might still vary, depending on the level of driver support for this feature. As an alternative, use nvidia-smi to lock the clocks externally and set this option to ``"none"``.
            - ``"none"``: No GPC or memory frequencies are changed during profiling.

            Default: ``"none"``
        cache_control: Controls the GPU caches while profiling. Allowed values:

            - ``"all"``: All GPU caches are flushed before each kernel replay iteration during profiling. While metric values in the execution environment of the application might be slightly different without invalidating the caches, this mode offers the most reproducible metric results across the replay passes and also across multiple runs of the target application.
            - ``"none"``: No GPU caches are flushed during profiling. This can improve performance and better replicates the application behavior if only a single kernel replay pass is necessary for metric collection. However, some metric results will vary depending on prior GPU work, and between replay iterations. This can lead to inconsistent and out-of-bounds metric values.

            Default: ``"all"``

        replay_mode: Mechanism for replaying a kernel launch several times to collect the selected metrics. Allowed values:

            - ``"kernel"``: Replay individual kernel launches during the execution of the application.
            - ``"range"``: Replay range of kernel launches during the execution of the application. Ranges are defined using nsight.annotate.

            Default: ``"kernel"``

        thermal_control: Whether thermal control is enabled. Default: ``True``
        output: Verbosity of the output.

            - ``"quiet"``: Suppresses all output.
            - ``"progress"``: Shows a progress bar along with details about profiling and data extraction progress.
            - ``"verbose"``: Displays the progress bar, configuration-specific logs, and profiler logs.

        output_prefix: If given, every intermediate profiler file is created with this prefix.
            For example, with `output_prefix="/home/user/run1_"` the profiler generates:

            - /home/user/run1_ncu-output-<name_of_decorated_function>-<run_id>.log
            - /home/user/run1_ncu-output-<name_of_decorated_function>-<run_id>.ncu-rep
            - /home/user/run1_processed_data-<name_of_decorated_function>-<run_id>.csv
            - /home/user/run1_profiled_data-<name_of_decorated_function>-<run_id>.csv

            ``<run_id>`` is a counter incremented on every call of the decorated function
            within the same Python process (0, 1, 2, ...), so the same decorated function can be
            called repeatedly without overwriting earlier results.

            If ``None``, the intermediate profiler files go into a directory under <TEMP_DIR> whose name starts with nspy. <TEMP_DIR> is the system's temporary directory (`$TMPDIR` or `/tmp` on Linux, `%TEMP%` on Windows).

        output_csv: Whether raw and processed profiling data are dumped to CSV files. Default: ``False``.
            When enabled, two CSV files are written:

            **Raw Data CSV** (``profiled_data-<function_name>-<run_id>.csv``): Unprocessed profiling data, one row per run per configuration. Columns include:

            - ``Annotation``: Name of the annotated region being profiled
            - ``Value``: Raw metric value collected by the profiler
            - ``Metric``: The metric being collected (e.g., ``gpu__time_duration.sum``)
            - ``Transformed``: Name of the function used to transform the metric (specified via ``derive_metric``), or ``False`` if no transformation was applied. For lambda functions, this shows ``"<lambda>"``
            - ``Kernel``: Name of the GPU kernel(s) launched
            - ``GPU``: GPU device name
            - ``Host``: Host machine name
            - ``ComputeClock``: GPU compute clock frequency during profiling
            - ``MemoryClock``: GPU memory clock frequency during profiling
            - ``<param_name>``: One column for each parameter of the decorated function

            **Processed Data CSV** (``processed_data-<function_name>-<run_id>.csv``): Statistics aggregated across multiple runs. Columns include:

            - ``Annotation``: Name of the annotated region being profiled
            - ``<param_name>``: One column for each parameter of the decorated function
            - ``AvgValue``: Average metric value across all runs
            - ``StdDev``: Standard deviation of the metric across runs
            - ``MinValue``: Minimum metric value observed
            - ``MaxValue``: Maximum metric value observed
            - ``NumRuns``: Number of runs used for aggregation
            - ``CI95_Lower``: Lower bound of the 95% confidence interval
            - ``CI95_Upper``: Upper bound of the 95% confidence interval
            - ``RelativeStdDevPct``: Standard deviation as a percentage of the mean
            - ``StableMeasurement``: Boolean indicating if the measurement is stable (low variance). The measurement is stable if ``RelativeStdDevPct`` < 2 % .
            - ``Metric``: The metric being collected
            - ``Transformed``: Name of the function used to transform the metric (specified via ``derive_metric``), or ``False`` if no transformation was applied. For lambda functions, this shows ``"<lambda>"``
            - ``Kernel``: Name of the GPU kernel(s) launched
            - ``GPU``: GPU device name
            - ``Host``: Host machine name
            - ``ComputeClock``: GPU compute clock frequency
            - ``MemoryClock``: GPU memory clock frequency

    Raises:
        ValueError: If ``output`` is not ``"quiet"``, ``"progress"`` or ``"verbose"``.
    """
    # Reject unknown verbosity levels before anything is created on disk.
    if output not in ("quiet", "progress", "verbose"):
        raise ValueError("output must be 'quiet', 'progress' or 'verbose'")

    show_progress = output in ("progress", "verbose")
    show_details = output == "verbose"

    # Work out where the ncu report, the ncu logs and the CSVs will be written.
    # Nothing is created when the NSPY_NCU_PROFILE environment variable is set.
    artifact_prefix = output_prefix
    if "NSPY_NCU_PROFILE" not in os.environ:
        if artifact_prefix is None:
            # Joining with "" appends the platform's trailing path separator,
            # so the fresh temporary directory itself acts as the prefix.
            artifact_prefix = os.path.join(tempfile.mkdtemp(prefix="nspy_"), "")
        else:
            os.makedirs(os.path.dirname(artifact_prefix) or ".", exist_ok=True)

    profile_settings = collection.core.ProfileSettings(
        configs=configs,
        runs=runs,
        output_progress=show_progress,
        output_detailed=show_details,
        derive_metric=derive_metric,
        normalize_against=normalize_against,
        thermal_control=thermal_control,
        output_prefix=artifact_prefix,
        output_csv=output_csv,
    )
    ncu_collector = collection.ncu.NCUCollector(
        metric=metric,
        ignore_kernel_list=ignore_kernel_list,
        combine_kernel_metrics=combine_kernel_metrics,
        clock_control=clock_control,
        cache_control=cache_control,
        replay_mode=replay_mode,
    )
    profiler = collection.core.NsightProfiler(profile_settings, ncu_collector)

    if _func is not None:
        # Bare @kernel: _func is the function being decorated, wrap it right away.
        return profiler(_func)  # type: ignore[return-value]

    # @kernel() / @kernel(args): hand back the profiler, which is then applied
    # to the function as the actual decorator.
    return profiler  # type: ignore[return-value]
242
+
243
+
244
def plot(
    filename: str = "plot.png",
    *,
    title: str = "Nsight Analyze Kernel Plot Results",
    ylabel: str | None = None,
    annotate_points: bool = False,
    show_aggregate: str | None = None,
    plot_type: str = "line",
    plot_width: int = 6,
    plot_height: int = 4,
    row_panels: Sequence[str] | None = None,
    col_panels: Sequence[str] | None = None,
    x_keys: Sequence[str] | None = None,
    print_data: bool = False,
    variant_fields: Sequence[str] | None = None,
    variant_annotations: Sequence[str] | None = None,
    plot_callback: Callable[[matplotlib.figure.Figure], None] | None = None,
) -> Callable[
    [Callable[..., collection.core.ProfileResults]],
    Callable[..., collection.core.ProfileResults],
]:
    """
    A decorator that plots the result of a profile-decorated function.
    This decorator is intended to be only used on functions that have been
    decorated with `@nsight.analyze.kernel`.

    The decorator returns a wrapped version of your function that maintains the same
    signature as the underlying ``@nsight.analyze.kernel`` decorated function::

        def wrapped_function(*args, configs=None, **kwargs) -> ProfileResults

    The function returns ``ProfileResults`` and generates a plot as a side effect.
    No plot is generated when the ``NSPY_NCU_PROFILE`` environment variable is set.

    Example usage::

        @nsight.analyze.plot(title="My Plot")
        @nsight.analyze.kernel
        def my_func(...):

    Args:
        filename: Filename to save the plot. Default: ``'plot.png'``
        title: Title for the plot. Default: ``'Nsight Analyze Kernel Plot Results'``
        ylabel: Label for the y-axis in the generated plot.
            Default: ``f'{metric} (avg: {runs} runs)'``
        annotate_points: If True, annotate the points with
            their numeric value in the plot.
            Default: ``False``
        show_aggregate: If "avg", show the average value in the plot. If "geomean", show the geometric mean value in the plot.
            Default: None
        plot_type: Type of plot to generate. Options are
            'line' or 'bar'. Default: ``'line'``
        plot_width: Width of the plot in inches. Default: ``6``
        plot_height: Height of the plot in inches. Default: ``4``
        row_panels: Enables generating subplots along
            the horizontal axis for each unique values of the listed function parameters.
            The provided strings must each match one argument of the
            nsight.analyze.kernel-decorated function. Default: ``None``
        col_panels: Enables generating subplots along
            the vertical axis for each unique values of the listed function parameters.
            The provided strings must each match one argument of the
            nsight.analyze.kernel-decorated function. Default: ``None``
        x_keys: List of fields to use for the x-axis. By
            default, we use all parameters of the decorated function except those
            specified in `row_panels` and `col_panels`.
        print_data: If True, print the data used for plotting.
            Default: ``False``
        variant_fields: List of config fields to use as variant fields (lines).
        variant_annotations: List of annotated range names for which to apply variant splitting. The provided strings must each match one of the names defined using nsight.annotate.
        plot_callback: Optional callable accepting a ``matplotlib.figure.Figure``.
            It is forwarded unchanged to the visualization step. Default: ``None``

    Raises:
        ValueError: If ``show_aggregate`` is neither ``None``, ``"avg"`` nor ``"geomean"``.
    """
    # Fail at decoration time on an unsupported aggregate (e.g. a typo such as
    # "mean") instead of silently producing a plot without any aggregate.
    if show_aggregate is not None and show_aggregate not in ("avg", "geomean"):
        raise ValueError("show_aggregate must be None, 'avg' or 'geomean'")

    show_avg = show_aggregate == "avg"
    show_geomean = show_aggregate == "geomean"

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            result = func(*args, **kwargs)

            # Plotting is skipped whenever NSPY_NCU_PROFILE is present in the
            # environment; the profiling result is still returned untouched.
            if "NSPY_NCU_PROFILE" not in os.environ:
                visualization.visualize(
                    result.to_dataframe(),
                    row_panels=row_panels,
                    col_panels=col_panels,
                    x_keys=x_keys,
                    print_data=print_data,
                    title=title,
                    filename=filename,
                    # A missing ylabel is passed on as an empty string.
                    ylabel=ylabel or "",
                    annotate_points=annotate_points,
                    show_avg=show_avg,
                    show_geomean=show_geomean,
                    plot_type=plot_type,
                    plot_width=plot_width,
                    plot_height=plot_height,
                    variant_fields=variant_fields,
                    variant_annotations=variant_annotations,
                    plot_callback=plot_callback,
                )
            return result

        return wrapper

    return decorator
346
+
347
+
348
+ # ------------------------------------------------------------------------------
349
+ # nsight.analyze.ignore_failures context manager
350
+ # For ignoring errors in warmup runs outside nsight.annotate
351
+ # ------------------------------------------------------------------------------
352
@contextlib.contextmanager
def ignore_failures() -> Any:
    """
    Context manager that swallows errors raised inside its block.

    Any exception deriving from ``Exception`` that escapes the block is
    discarded, so it neither propagates nor makes the decorated function
    fail. Exceptions that derive only from ``BaseException`` (for example
    ``KeyboardInterrupt``) are not suppressed.
    """
    with contextlib.suppress(Exception):
        yield
nsight/annotation.py ADDED
@@ -0,0 +1,80 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import functools
5
+ import importlib.util
6
+ from collections.abc import Callable
7
+ from typing import Any
8
+
9
+ import nvtx
10
+
11
+ import nsight.utils as utils
12
+ from nsight.exceptions import CUDA_CORE_UNAVAILABLE_MSG
13
+
14
+
15
class annotate(nvtx.annotate):  # type: ignore[misc]
    """
    A decorator/context-manager hybrid for marking profiling regions.
    The encapsulated code will be profiled and associated with an NVTX
    range of the given annotate name.

    Example usage::

        # as context manager
        with nsight.annotate("name"):
            # your kernel launch here

        # as decorator
        @nsight.annotate("name")
        def your_kernel_launcher(...):
            ...

    Args:
        name: Name of the annotation to be used for profiling.
        ignore_failures: Flag indicating whether to ignore
            failures in the annotate context. If set to ``True``, any exception
            deriving from ``Exception`` that is raised within the context will
            be ignored, and the profiling will continue. The measured metric
            for this run will be set to NaN. Exceptions deriving only from
            ``BaseException`` (``KeyboardInterrupt``, ``SystemExit``, ...)
            always propagate.
            Default: ``False``

    Raises:
        ImportError: If ``ignore_failures`` is ``True`` while
            ``nsight.utils.CUDA_CORE_AVAILABLE`` is false.

    Note:
        All annotations are created under the NVTX domain ``"nsight-python"``.
        This domain is used internally to filter and identify Nsight Python
        annotations in profiling tools.

    Note:
        Nested annotations are currently not supported. However, since each
        annotation is expected to contain a single kernel launch by default,
        nested annotations should not be necessary in typical usage scenarios.

    """

    def __init__(self, name: str, ignore_failures: bool = False):
        self.name = name
        self.ignore_failures = ignore_failures

        # Check if cuda-core is available when ignore_failures is True
        if ignore_failures and not utils.CUDA_CORE_AVAILABLE:
            raise ImportError(CUDA_CORE_UNAVAILABLE_MSG)

        super().__init__(name, domain=utils.NVTX_DOMAIN)

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> bool:
        """Close the NVTX range and decide whether a raised exception is swallowed.

        Returns ``True`` when the block finished without an exception or when
        an ``Exception`` subclass was raised with ``ignore_failures`` enabled;
        returns ``False`` otherwise so that the exception propagates.
        """
        # Only ordinary errors are ever suppressed. Interpreter-level signals
        # such as KeyboardInterrupt or SystemExit must reach the caller even
        # with ignore_failures=True, matching nsight.analyze.ignore_failures,
        # which likewise only catches Exception.
        suppress = (
            exc_type is not None
            and self.ignore_failures
            and issubclass(exc_type, Exception)
        )

        try:
            if suppress:
                # NOTE(review): presumably keeps a kernel launch inside the
                # range for the failed run -- confirm against nsight.utils.
                utils.launch_dummy_kernel_module()
        finally:
            # Always close the NVTX range, even if the dummy launch itself fails.
            super().__exit__(exc_type, exc_value, traceback)

        return exc_type is None or suppress

    def __call__(self, func: Callable[..., Any]) -> Callable[..., Any]:
        """Wrap ``func`` so that every call runs inside this annotation."""

        @functools.wraps(func)
        def wrapped(*args: Any, **kwargs: Any) -> Any:
            with self:
                return func(*args, **kwargs)

        return wrapped
@@ -0,0 +1,10 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import functools
5
+
6
+ import nsight.collection.core as core
7
+ import nsight.collection.ncu as ncu
8
+ import nsight.utils as utils
9
+
10
+ __all__ = ["ncu", "core"]