numba-cuda 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/codegen.py +36 -14
- numba_cuda/numba/cuda/compiler.py +180 -10
- numba_cuda/numba/cuda/cuda_paths.py +3 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +103 -2
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +37 -4
- numba_cuda/numba/cuda/dispatcher.py +8 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +74 -18
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +2 -4
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +3 -10
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -2
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +6 -2
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +7 -0
- {numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/METADATA +12 -8
- {numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/RECORD +21 -21
- {numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/LICENSE +0 -0
- {numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/WHEEL +0 -0
- {numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
-0.0.19
+0.0.21
numba_cuda/numba/cuda/codegen.py
CHANGED
@@ -9,7 +9,6 @@ import os
 import subprocess
 import tempfile
 
-
 CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
 
 
@@ -181,17 +180,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
 
         return ltoir
 
-    def get_cubin(self, cc=None):
-        cc = self._ensure_cc(cc)
-
-        cubin = self._cubin_cache.get(cc, None)
-        if cubin:
-            return cubin
-
-        linker = driver.Linker.new(
-            max_registers=self._max_registers, cc=cc, lto=self._lto
-        )
-
+    def _link_all(self, linker, cc, ignore_nonlto=False):
         if linker.lto:
             ltoir = self.get_ltoir(cc=cc)
             linker.add_ltoir(ltoir)
@@ -200,11 +189,44 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
             linker.add_ptx(ptx.encode())
 
         for path in self._linking_files:
-            linker.add_file_guess_ext(path)
+            linker.add_file_guess_ext(path, ignore_nonlto)
         if self.needs_cudadevrt:
-            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))
+            linker.add_file_guess_ext(
+                get_cudalib('cudadevrt', static=True), ignore_nonlto
+            )
+
+    def get_cubin(self, cc=None):
+        cc = self._ensure_cc(cc)
 
+        cubin = self._cubin_cache.get(cc, None)
+        if cubin:
+            return cubin
+
+        if self._lto and config.DUMP_ASSEMBLY:
+            linker = driver.Linker.new(
+                max_registers=self._max_registers,
+                cc=cc,
+                additional_flags=["-ptx"],
+                lto=self._lto
+            )
+            # `-ptx` flag is meant to view the optimized PTX for LTO objects.
+            # Non-LTO objects are not passed to linker.
+            self._link_all(linker, cc, ignore_nonlto=True)
+
+            ptx = linker.get_linked_ptx().decode('utf-8')
+
+            print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-'))
+            print(ptx)
+            print('=' * 80)
+
+        linker = driver.Linker.new(
+            max_registers=self._max_registers,
+            cc=cc,
+            lto=self._lto
+        )
+        self._link_all(linker, cc, ignore_nonlto=False)
         cubin = linker.complete()
+
        self._cubin_cache[cc] = cubin
        self._linkerinfo_cache[cc] = linker.info_log
 
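The practical effect of the `_link_all`/`get_cubin` split above: when LTO is enabled together with Numba's `NUMBA_DUMP_ASSEMBLY` option, the library now performs an extra `-ptx` link over only the LTO-able inputs so the post-LTO PTX can be printed before the real cubin link. A minimal sketch of how this surfaces, assuming a CUDA GPU and an LTO-capable linker such as pynvjitlink (the kernel here is illustrative, not part of this diff):

    # Sketch only: names and setup are illustrative.
    import os
    os.environ["NUMBA_DUMP_ASSEMBLY"] = "1"   # must be set before importing numba

    import numpy as np
    from numba import cuda

    @cuda.jit(lto=True)                        # lto= is new in this release
    def scale(out, x):
        i = cuda.grid(1)
        if i < out.size:
            out[i] = 2.0 * x[i]

    x = np.arange(16, dtype=np.float64)
    out = np.zeros_like(x)
    scale[1, 16](out, x)   # compilation prints an "ASSEMBLY (AFTER LTO) ..." block
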
numba_cuda/numba/cuda/compiler.py
CHANGED
@@ -1,6 +1,7 @@
 from llvmlite import ir
 from numba.core.typing.templates import ConcreteTemplate
-from numba.core import types, typing, funcdesc, config, compiler, sigutils
+from numba.core import (cgutils, types, typing, funcdesc, config, compiler,
+                        sigutils, utils)
 from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
                                  DefaultPassBuilder, Flags, Option,
                                  CompileResult)
@@ -11,7 +12,10 @@ from numba.core.errors import NumbaInvalidConfigWarning
 from numba.core.typed_passes import (IRLegalization, NativeLowering,
                                      AnnotateTypes)
 from warnings import warn
+from numba.cuda import nvvmutils
 from numba.cuda.api import get_current_device
+from numba.cuda.cudadrv import nvvm
+from numba.cuda.descriptor import cuda_target
 from numba.cuda.target import CUDACABICallConv
 
 
@@ -24,6 +28,15 @@ def _nvvm_options_type(x):
     return x
 
 
+def _optional_int_type(x):
+    if x is None:
+        return None
+
+    else:
+        assert isinstance(x, int)
+        return x
+
+
 class CUDAFlags(Flags):
     nvvm_options = Option(
         type=_nvvm_options_type,
@@ -35,6 +48,16 @@ class CUDAFlags(Flags):
         default=None,
         doc="Compute Capability",
     )
+    max_registers = Option(
+        type=_optional_int_type,
+        default=None,
+        doc="Max registers"
+    )
+    lto = Option(
+        type=bool,
+        default=False,
+        doc="Enable Link-time Optimization"
+    )
 
 
 # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
@@ -109,7 +132,11 @@ class CreateLibrary(LoweringPass):
         codegen = state.targetctx.codegen()
         name = state.func_id.func_qualname
         nvvm_options = state.flags.nvvm_options
-        state.library = codegen.create_library(name, nvvm_options=nvvm_options)
+        max_registers = state.flags.max_registers
+        lto = state.flags.lto
+        state.library = codegen.create_library(name, nvvm_options=nvvm_options,
+                                               max_registers=max_registers,
+                                               lto=lto)
         # Enable object caching upfront so that the library can be serialized.
         state.library.enable_object_caching()
 
@@ -152,7 +179,7 @@ class CUDACompiler(CompilerBase):
 @global_compiler_lock
 def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
                  inline=False, fastmath=False, nvvm_options=None,
-                 cc=None):
+                 cc=None, max_registers=None, lto=False):
     if cc is None:
         raise ValueError('Compute Capability must be supplied')
 
@@ -189,6 +216,8 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
     if nvvm_options:
         flags.nvvm_options = nvvm_options
     flags.compute_capability = cc
+    flags.max_registers = max_registers
+    flags.lto = lto
 
     # Run compilation pipeline
     from numba.core.target_extension import target_override
@@ -247,11 +276,155 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
         builder, func, restype, argtypes, callargs)
     builder.ret(return_value)
 
+    if config.DUMP_LLVM:
+        utils.dump_llvm(fndesc, wrapper_module)
+
     library.add_ir_module(wrapper_module)
     library.finalize()
     return library
 
 
+def kernel_fixup(kernel, debug):
+    if debug:
+        exc_helper = add_exception_store_helper(kernel)
+
+    # Pass 1 - replace:
+    #
+    #     ret <value>
+    #
+    # with:
+    #
+    #     exc_helper(<value>)
+    #     ret void
+
+    for block in kernel.blocks:
+        for i, inst in enumerate(block.instructions):
+            if isinstance(inst, ir.Ret):
+                old_ret = block.instructions.pop()
+                block.terminator = None
+
+                # The original return's metadata will be set on the new
+                # instructions in order to preserve debug info
+                metadata = old_ret.metadata
+
+                builder = ir.IRBuilder(block)
+                if debug:
+                    status_code = old_ret.operands[0]
+                    exc_helper_call = builder.call(exc_helper, (status_code,))
+                    exc_helper_call.metadata = metadata
+
+                new_ret = builder.ret_void()
+                new_ret.metadata = old_ret.metadata
+
+                # Need to break out so we don't carry on modifying what we are
+                # iterating over. There can only be one return in a block
+                # anyway.
+                break
+
+    # Pass 2: remove stores of null pointer to return value argument pointer
+
+    return_value = kernel.args[0]
+
+    for block in kernel.blocks:
+        remove_list = []
+
+        # Find all stores first
+        for inst in block.instructions:
+            if (isinstance(inst, ir.StoreInstr)
+                    and inst.operands[1] == return_value):
+                remove_list.append(inst)
+
+        # Remove all stores
+        for to_remove in remove_list:
+            block.instructions.remove(to_remove)
+
+    # Replace non-void return type with void return type and remove return
+    # value
+
+    if isinstance(kernel.type, ir.PointerType):
+        new_type = ir.PointerType(ir.FunctionType(ir.VoidType(),
+                                                  kernel.type.pointee.args[1:]))
+    else:
+        new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:])
+
+    kernel.type = new_type
+    kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
+    kernel.args = kernel.args[1:]
+
+    # Mark as a kernel for NVVM
+
+    nvvm.set_cuda_kernel(kernel)
+
+    if config.DUMP_LLVM:
+        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-'))
+        print(kernel.module)
+        print('=' * 80)
+
+
+def add_exception_store_helper(kernel):
+
+    # Create global variables for exception state
+
+    def define_error_gv(postfix):
+        name = kernel.name + postfix
+        gv = cgutils.add_global_variable(kernel.module, ir.IntType(32),
+                                         name)
+        gv.initializer = ir.Constant(gv.type.pointee, None)
+        return gv
+
+    gv_exc = define_error_gv("__errcode__")
+    gv_tid = []
+    gv_ctaid = []
+    for i in 'xyz':
+        gv_tid.append(define_error_gv("__tid%s__" % i))
+        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))
+
+    # Create exception store helper function
+
+    helper_name = kernel.name + "__exc_helper__"
+    helper_type = ir.FunctionType(ir.VoidType(), (ir.IntType(32),))
+    helper_func = ir.Function(kernel.module, helper_type, helper_name)
+
+    block = helper_func.append_basic_block(name="entry")
+    builder = ir.IRBuilder(block)
+
+    # Implement status check / exception store logic
+
+    status_code = helper_func.args[0]
+    call_conv = cuda_target.target_context.call_conv
+    status = call_conv._get_return_status(builder, status_code)
+
+    # Check error status
+    with cgutils.if_likely(builder, status.is_ok):
+        builder.ret_void()
+
+    with builder.if_then(builder.not_(status.is_python_exc)):
+        # User exception raised
+        old = ir.Constant(gv_exc.type.pointee, None)
+
+        # Use atomic cmpxchg to prevent rewriting the error status
+        # Only the first error is recorded
+
+        xchg = builder.cmpxchg(gv_exc, old, status.code,
+                               'monotonic', 'monotonic')
+        changed = builder.extract_value(xchg, 1)
+
+        # If the xchange is successful, save the thread ID.
+        sreg = nvvmutils.SRegBuilder(builder)
+        with builder.if_then(changed):
+            for dim, ptr, in zip("xyz", gv_tid):
+                val = sreg.tid(dim)
+                builder.store(val, ptr)
+
+            for dim, ptr, in zip("xyz", gv_ctaid):
+                val = sreg.ctaid(dim)
+                builder.store(val, ptr)
+
+    builder.ret_void()
+
+    return helper_func
+
+
 @global_compiler_lock
 def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
             fastmath=False, cc=None, opt=None, abi="c", abi_info=None,
@@ -347,13 +520,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
         lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
                                  nvvm_options)
     else:
-        code = pyfunc.__code__
-        filename = code.co_filename
-        linenum = code.co_firstlineno
-
-        lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
-                                              lineinfo, nvvm_options, filename,
-                                              linenum)
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, debug)
 
     if lto:
         code = lib.get_ltoir(cc=cc)
numba_cuda/numba/cuda/cuda_paths.py
CHANGED
@@ -310,7 +310,9 @@ def get_conda_include_dir():
     # though usually it shouldn't.
     include_dir = os.path.join(sys.prefix, 'include')
 
-    if os.path.exists(include_dir) and os.path.isdir(include_dir):
+    if (os.path.exists(include_dir) and os.path.isdir(include_dir)
+            and os.path.exists(os.path.join(include_dir,
+                                            'cuda_device_runtime_api.h'))):
         return include_dir
     return
 
numba_cuda/numba/cuda/cudadrv/driver.py
CHANGED
@@ -21,6 +21,9 @@ import threading
 import traceback
 import asyncio
 import pathlib
+import subprocess
+import tempfile
+import re
 from itertools import product
 from abc import ABCMeta, abstractmethod
 from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof,
@@ -36,7 +39,7 @@ from .error import CudaSupportError, CudaDriverError
 from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
 from .mappings import FILE_EXTENSION_MAP
-from .linkable_code import LinkableCode
+from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
 from numba.cuda.cudadrv import enums, drvapi, nvrtc
 
 USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
@@ -2683,12 +2686,18 @@ class Linker(metaclass=ABCMeta):
             cu = f.read()
         self.add_cu(cu, os.path.basename(path))
 
-    def add_file_guess_ext(self, path_or_code):
+    def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
         """
         Add a file or LinkableCode object to the link. If a file is
         passed, the type will be inferred from the extension. A LinkableCode
         object represents a file already in memory.
+
+        When `ignore_nonlto` is set to true, do not add code that will not
+        be LTO-ed in the linking process. This is useful in inspecting the
+        LTO-ed portion of the PTX when linker is added with objects that can be
+        both LTO-ed and not LTO-ed.
         """
+
         if isinstance(path_or_code, str):
             ext = pathlib.Path(path_or_code).suffix
             if ext == '':
@@ -2704,6 +2713,26 @@ class Linker(metaclass=ABCMeta):
                     "Don't know how to link file with extension "
                     f"{ext}"
                 )
+
+            if ignore_nonlto:
+                warn_and_return = False
+                if kind in (
+                    FILE_EXTENSION_MAP["fatbin"], FILE_EXTENSION_MAP["o"]
+                ):
+                    entry_types = inspect_obj_content(path_or_code)
+                    if "nvvm" not in entry_types:
+                        warn_and_return = True
+                elif kind != FILE_EXTENSION_MAP["ltoir"]:
+                    warn_and_return = True
+
+                if warn_and_return:
+                    warnings.warn(
+                        f"Not adding {path_or_code} as it is not "
+                        "optimizable at link time, and `ignore_nonlto == "
+                        "True`."
+                    )
+                    return
+
             self.add_file(path_or_code, kind)
             return
         else:
@@ -2716,6 +2745,25 @@ class Linker(metaclass=ABCMeta):
             if path_or_code.kind == "cu":
                 self.add_cu(path_or_code.data, path_or_code.name)
             else:
+                if ignore_nonlto:
+                    warn_and_return = False
+                    if isinstance(path_or_code, (Fatbin, Object)):
+                        with tempfile.NamedTemporaryFile("w") as fp:
+                            fp.write(path_or_code.data)
+                            entry_types = inspect_obj_content(fp.name)
+                        if "nvvm" not in entry_types:
+                            warn_and_return = True
+                    elif not isinstance(path_or_code, LTOIR):
+                        warn_and_return = True
+
+                    if warn_and_return:
+                        warnings.warn(
+                            f"Not adding {path_or_code.name} as it is not "
+                            "optimizable at link time, and `ignore_nonlto == "
+                            "True`."
+                        )
+                        return
+
                 self.add_data(
                     path_or_code.data, path_or_code.kind, path_or_code.name
                 )
@@ -3065,6 +3113,28 @@ class PyNvJitLinker(Linker):
         name = pathlib.Path(path).name
         self.add_data(data, kind, name)
 
+    def add_cu(self, cu, name):
+        """Add CUDA source in a string to the link. The name of the source
+        file should be specified in `name`."""
+        with driver.get_active_context() as ac:
+            dev = driver.get_device(ac.devnum)
+            cc = dev.compute_capability
+
+        program, log = nvrtc.compile(cu, name, cc, ltoir=self.lto)
+
+        if not self.lto and config.DUMP_ASSEMBLY:
+            print(("ASSEMBLY %s" % name).center(80, "-"))
+            print(program)
+            print("=" * 80)
+
+        suffix = ".ltoir" if self.lto else ".ptx"
+        program_name = os.path.splitext(name)[0] + suffix
+        # Link the program's PTX or LTOIR using the normal linker mechanism
+        if self.lto:
+            self.add_ltoir(program, program_name)
+        else:
+            self.add_ptx(program.encode(), program_name)
+
     def add_data(self, data, kind, name):
         if kind == FILE_EXTENSION_MAP["cubin"]:
             fn = self._linker.add_cubin
@@ -3086,6 +3156,12 @@ class PyNvJitLinker(Linker):
         except NvJitLinkError as e:
             raise LinkerError from e
 
+    def get_linked_ptx(self):
+        try:
+            return self._linker.get_linked_ptx()
+        except NvJitLinkError as e:
+            raise LinkerError from e
+
     def complete(self):
         try:
             return self._linker.get_linked_cubin()
@@ -3361,3 +3437,28 @@ def get_version():
     Return the driver version as a tuple of (major, minor)
     """
     return driver.get_version()
+
+
+def inspect_obj_content(objpath: str):
+    """
+    Given path to a fatbin or object, use `cuobjdump` to examine its content
+    Return the set of entries in the object.
+    """
+    code_types :set[str] = set()
+
+    try:
+        out = subprocess.run(["cuobjdump", objpath], check=True,
+                             capture_output=True)
+    except FileNotFoundError as e:
+        msg = ("cuobjdump has not been found. You may need "
+               "to install the CUDA toolkit and ensure that "
+               "it is available on your PATH.\n")
+        raise RuntimeError(msg) from e
+
+    objtable = out.stdout.decode('utf-8')
+    entry_pattern = r"Fatbin (.*) code"
+    for line in objtable.split("\n"):
+        if match := re.match(entry_pattern, line):
+            code_types.add(match.group(1))
+
+    return code_types
numba_cuda/numba/cuda/cudadrv/enums.py
CHANGED
@@ -55,7 +55,7 @@ CUDA_ERROR_INVALID_HANDLE = 400
 CUDA_ERROR_ILLEGAL_STATE = 401
 CUDA_ERROR_NOT_FOUND = 500
 CUDA_ERROR_NOT_READY = 600
-CUDA_ERROR_ILLEGAL_ADRESS = 700
+CUDA_ERROR_ILLEGAL_ADDRESS = 700
 CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
 CUDA_ERROR_LAUNCH_TIMEOUT = 702
 CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
numba_cuda/numba/cuda/cudadrv/nvrtc.py
CHANGED
@@ -61,6 +61,14 @@ class NVRTC:
     NVVM interface. Initialization is protected by a lock and uses the standard
     (for Numba) open_cudalib function to load the NVRTC library.
     """
+
+    _CU12ONLY_PROTOTYPES = {
+        # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet);
+        "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
+        # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto);
+        "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p)
+    }
+
     _PROTOTYPES = {
         # nvrtcResult nvrtcVersion(int *major, int *minor)
         'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
@@ -110,6 +118,10 @@ class NVRTC:
                 cls.__INSTANCE = None
                 raise NvrtcSupportError("NVRTC cannot be loaded") from e
 
+            from numba.cuda.cudadrv.runtime import get_version
+            if get_version() >= (12, 0):
+                inst._PROTOTYPES |= inst._CU12ONLY_PROTOTYPES
+
             # Find & populate functions
             for name, proto in inst._PROTOTYPES.items():
                 func = getattr(lib, name)
@@ -208,10 +220,22 @@ class NVRTC:
 
         return ptx.value.decode()
 
+    def get_lto(self, program):
+        """
+        Get the compiled LTOIR as a Python bytes object.
+        """
+        lto_size = c_size_t()
+        self.nvrtcGetLTOIRSize(program.handle, byref(lto_size))
+
+        lto = b" " * lto_size.value
+        self.nvrtcGetLTOIR(program.handle, lto)
+
+        return lto
 
-def compile(src, name, cc):
+
+def compile(src, name, cc, ltoir=False):
     """
-    Compile a CUDA C/C++ source to PTX for a given compute capability.
+    Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability.
 
     :param src: The source code to compile
     :type src: str
@@ -219,6 +243,8 @@ def compile(src, name, cc):
     :type name: str
     :param cc: A tuple ``(major, minor)`` of the compute capability
     :type cc: tuple
+    :param ltoir: Compile into LTOIR if True, otherwise into PTX
+    :type ltoir: bool
     :return: The compiled PTX and compilation log
     :rtype: tuple
     """
@@ -242,6 +268,9 @@ def compile(src, name, cc):
     numba_include = f'-I{numba_cuda_path}'
     options = [arch, *cuda_include, numba_include, '-rdc', 'true']
 
+    if ltoir:
+        options.append("-dlto")
+
     if nvrtc.get_version() < (12, 0):
         options += ["-std=c++17"]
 
@@ -261,5 +290,9 @@ def compile(src, name, cc):
         msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
         warnings.warn(msg)
 
-    ptx = nvrtc.get_ptx(program)
-    return ptx, log
+    if ltoir:
+        ltoir = nvrtc.get_lto(program)
+        return ltoir, log
+    else:
+        ptx = nvrtc.get_ptx(program)
+        return ptx, log
numba_cuda/numba/cuda/dispatcher.py
CHANGED
@@ -14,7 +14,7 @@ from numba.core.typing.typeof import Purpose, typeof
 
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
-from numba.cuda.compiler import compile_cuda, CUDACompiler
+from numba.cuda.compiler import compile_cuda, CUDACompiler, kernel_fixup
 from numba.cuda.cudadrv import driver
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
@@ -102,15 +102,14 @@ class _Kernel(serialize.ReduceMixin):
                                      inline=inline,
                                      fastmath=fastmath,
                                      nvvm_options=nvvm_options,
-                                     cc=cc)
+                                     cc=cc,
+                                     max_registers=max_registers,
+                                     lto=lto)
         tgt_ctx = cres.target_context
-        code = self.py_func.__code__
-        filename = code.co_filename
-        linenum = code.co_firstlineno
-        lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
-                                                  debug, lineinfo, nvvm_options,
-                                                  filename, linenum,
-                                                  max_registers, lto)
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, self.debug)
 
         if not link:
             link = []
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
CHANGED
@@ -5,6 +5,10 @@ from numba.cuda.cudadrv.driver import PyNvJitLinker
 
 import itertools
 import os
+import io
+import contextlib
+import warnings
+
 from numba.cuda import get_current_device
 from numba import cuda
 from numba import config
@@ -23,6 +27,9 @@ if TEST_BIN_DIR:
     test_device_functions_fatbin = os.path.join(
         TEST_BIN_DIR, "test_device_functions.fatbin"
     )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
     test_device_functions_o = os.path.join(
         TEST_BIN_DIR, "test_device_functions.o"
     )
@@ -156,32 +163,81 @@ class TestLinker(CUDATestCase):
             test_device_functions_o,
             test_device_functions_ptx,
         )
+        for lto in [True, False]:
+            for file in files:
+                with self.subTest(file=file):
+                    sig = "uint32(uint32, uint32)"
+                    add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+                    @cuda.jit(link=[file], lto=lto)
+                    def kernel(result):
+                        result[0] = add_from_numba(1, 2)
+
+                    result = cuda.device_array(1)
+                    kernel[1, 1](result)
+                    assert result[0] == 3
+
+    def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
+        files = [
+            test_device_functions_cu,
+            test_device_functions_ltoir,
+            test_device_functions_fatbin_multi
+        ]
+
+        config.DUMP_ASSEMBLY = True
+
         for file in files:
             with self.subTest(file=file):
-                sig = "uint32(uint32, uint32)"
-                add_from_numba = cuda.declare_device("add_from_numba", sig)
+                f = io.StringIO()
+                with contextlib.redirect_stdout(f):
+                    sig = "uint32(uint32, uint32)"
+                    add_from_numba = cuda.declare_device("add_from_numba", sig)
 
-                @cuda.jit(link=[file])
-                def kernel(result):
-                    result[0] = add_from_numba(1, 2)
+                    @cuda.jit(link=[file], lto=True)
+                    def kernel(result):
+                        result[0] = add_from_numba(1, 2)
 
-                result = cuda.device_array(1)
-                kernel[1, 1](result)
-                assert result[0] == 3
+                    result = cuda.device_array(1)
+                    kernel[1, 1](result)
+                    assert result[0] == 3
 
-    def test_nvjitlink_jit_with_linkable_code_lto(self):
-        file = test_device_functions_ltoir
+                self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
 
-        sig = "uint32(uint32, uint32)"
-        add_from_numba = cuda.declare_device("add_from_numba", sig)
+        config.DUMP_ASSEMBLY = False
 
-        @cuda.jit(link=[file], lto=True)
-        def kernel(result):
-            result[0] = add_from_numba(1, 2)
+    def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
+        files = [
+            test_device_functions_a,
+            test_device_functions_cubin,
+            test_device_functions_fatbin,
+            test_device_functions_o,
+            test_device_functions_ptx,
+        ]
 
-        result = cuda.device_array(1)
-        kernel[1, 1](result)
-        assert result[0] == 3
+        config.DUMP_ASSEMBLY = True
+
+        for file in files:
+            with self.subTest(file=file):
+                with warnings.catch_warnings(record=True) as w:
+                    with contextlib.redirect_stdout(None):  # suppress other PTX
+                        sig = "uint32(uint32, uint32)"
+                        add_from_numba = cuda.declare_device(
+                            "add_from_numba", sig
+                        )
+
+                        @cuda.jit(link=[file], lto=True)
+                        def kernel(result):
+                            result[0] = add_from_numba(1, 2)
+
+                        result = cuda.device_array(1)
+                        kernel[1, 1](result)
+                        assert result[0] == 3
+
+                assert len(w) == 1
+                self.assertIn("it is not optimizable at link time, and "
+                              "`ignore_nonlto == True`", str(w[0].message))
+
+        config.DUMP_ASSEMBLY = False
 
     def test_nvjitlink_jit_with_invalid_linkable_code(self):
         with open(test_device_functions_cubin, "rb") as f:
numba_cuda/numba/cuda/tests/cudapy/test_debug.py
CHANGED
@@ -48,13 +48,11 @@ class TestDebugOutput(CUDATestCase):
         self.assertRaises(AssertionError, check_meth, out)
 
     def _check_dump_bytecode(self, out):
-        if PYVERSION in ((3, 11), (3, 12)):
+        if PYVERSION > (3, 10):
             # binop with arg=0 is binary add, see CPython dis.py and opcode.py
             self.assertIn('BINARY_OP(arg=0', out)
-        elif PYVERSION in ((3, 9), (3, 10)):
-            self.assertIn('BINARY_ADD', out)
         else:
-            raise NotImplementedError(PYVERSION)
+            self.assertIn('BINARY_ADD', out)
 
     def _check_dump_cfg(self, out):
         self.assertIn('CFG dominators', out)
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py
CHANGED
@@ -33,10 +33,7 @@ class TestInspect(CUDATestCase):
         self.assertIn("foo", llvm)
 
         # Kernel in LLVM
-        self.assertIn("cuda.kernel.wrapper", llvm)
-
-        # Wrapped device function body in LLVM
-        self.assertIn("define linkonce_odr i32", llvm)
+        self.assertIn("define void @", llvm)
 
         asm = foo.inspect_asm(sig)
 
@@ -72,12 +69,8 @@ class TestInspect(CUDATestCase):
         self.assertIn("foo", llvmirs[float64, float64])
 
         # Kernels in LLVM
-        self.assertIn("cuda.kernel.wrapper", llvmirs[intp, intp])
-        self.assertIn("cuda.kernel.wrapper", llvmirs[float64, float64])
-
-        # Wrapped device function bodies in LLVM
-        self.assertIn("define linkonce_odr i32", llvmirs[intp, intp])
-        self.assertIn("define linkonce_odr i32", llvmirs[float64, float64])
+        self.assertIn("define void @", llvmirs[intp, intp])
+        self.assertIn("define void @", llvmirs[float64, float64])
 
         asmdict = foo.inspect_asm()
 
numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py
CHANGED
@@ -170,10 +170,9 @@ class TestCudaLineInfo(CUDATestCase):
                     subprograms += 1
 
         # One DISubprogram for each of:
-        # - The kernel wrapper
         # - The caller
         # - The callee
-        expected_subprograms = 3
+        expected_subprograms = 2
 
         self.assertEqual(subprograms, expected_subprograms,
                          f'"Expected {expected_subprograms} DISubprograms; '
numba_cuda/numba/cuda/tests/cudapy/test_optimization.py
CHANGED
@@ -14,8 +14,11 @@ def device_func(x, y, z):
 
 
 # Fragments of code that are removed from kernel_func's PTX when optimization
-# is on
-removed_by_opt = ('__local_depot0', 'call.uni', 'st.param.b64')
+# is on. Previously this list was longer when kernel wrappers were used - if
+# the test function were more complex it may be possible to isolate additional
+# fragments of PTX we could check for the absence / presence of, but removal of
+# the use of local memory is a good indicator that optimization was applied.
+removed_by_opt = ( '__local_depot0',)
 
 
 @skip_on_cudasim('Simulator does not optimize code')
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py
CHANGED
@@ -1,6 +1,6 @@
 from numba.cuda.testing import (skip_on_cudasim, skip_unless_cudasim, unittest,
                                 CUDATestCase)
-from numba import cuda
+from numba import config, cuda
 
 # Basic tests that stream APIs execute on the hardware and in the simulator.
 #
@@ -34,7 +34,11 @@ class TestStreamAPI(CUDATestCase):
         # We don't test synchronization on the stream because it's not a real
         # stream - we used a dummy pointer for testing the API, so we just
         # ensure that the stream handle matches the external stream pointer.
-        self.assertEqual(ptr, s.handle.value)
+        if config.CUDA_USE_NVIDIA_BINDING:
+            value = int(s.handle)
+        else:
+            value = s.handle.value
+        self.assertEqual(ptr, value)
 
     @skip_unless_cudasim("External streams are usable with hardware")
     def test_external_stream_simulator_unavailable(self):
numba_cuda/numba/cuda/tests/test_binary_generation/Makefile
CHANGED
@@ -14,9 +14,14 @@ endif
 # Gencode flags suitable for most tests
 GENCODE := -gencode arch=compute_$(GPU_CC),code=sm_$(GPU_CC)
 
+MULTI_GENCODE := -gencode arch=compute_$(GPU_CC),code=[sm_$(GPU_CC),lto_$(GPU_CC)]
+
 # Fatbin tests need to generate code for an additional compute capability
 FATBIN_GENCODE := $(GENCODE) -gencode arch=compute_$(ALT_CC),code=sm_$(ALT_CC)
 
+# Fatbin that contains both LTO, SASS for multiple architectures
+MULTI_FATBIN_GENCODE := $(MULTI_GENCODE) -gencode arch=compute_$(ALT_CC),code=[sm_$(ALT_CC),lto_$(ALT_CC)]
+
 # LTO-IR tests need to generate for the LTO "architecture" instead
 LTOIR_GENCODE := -gencode arch=lto_$(GPU_CC),code=lto_$(GPU_CC)
 
@@ -30,6 +35,7 @@ PTX_FLAGS := $(GENCODE) -ptx
 OBJECT_FLAGS := $(GENCODE) -dc
 LIBRARY_FLAGS := $(GENCODE) -lib
 FATBIN_FLAGS := $(FATBIN_GENCODE) --fatbin
+MULTI_FATBIN_FLAGS := $(MULTI_FATBIN_GENCODE) --fatbin
 LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
 
 OUTPUT_DIR := ./
@@ -41,6 +47,7 @@ all:
 	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/undefined_extern.cubin undefined_extern.cu
 	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.cubin test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.fatbin test_device_functions.cu
+	nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions_multi.fatbin test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ptx test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu
{numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: numba-cuda
-Version: 0.0.19
+Version: 0.0.21
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
@@ -13,17 +13,21 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numba>=0.59.1
 
+<div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
+
 # Numba CUDA Target
 
-An out-of-tree CUDA target for Numba.
+The CUDA target for Numba. Please visit the [official
+documentation](https://nvidia.github.io/numba-cuda) to get started!
+
 
-This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
-and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
-used as the `numba.cuda` module instead of the code from the `numba` package.
+To report issues or file feature requests, please use the [issue
+tracker](https://github.com/NVIDIA/numba-cuda/issues).
 
-This is presently in an early state and is published for testing and feedback.
+To raise questions or initiate discussions, please use the [Numba Discourse
+forum](https://numba.discourse.group).
 
-## Building
+## Building from source
 
 Install as an editable install:
 
@@ -31,7 +35,7 @@ Install as an editable install:
 pip install -e .
 ```
 
-Running tests
+## Running tests
 
 ```
 python -m numba.runtests numba.cuda.tests
{numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 _numba_cuda_redirector.pth,sha256=cmfMMmV0JPh3yEpl4bGeM9AuXiVVMSo6Z_b7RaQL3XE,30
 _numba_cuda_redirector.py,sha256=rc56rnb40w3AtrqnhS66JSgYTSTsi3iTn8yP3NuoQV8,2401
-numba_cuda/VERSION,sha256=
+numba_cuda/VERSION,sha256=N0wu4MReU0U_7uoeU-17rOqTT3ZYtrLE_x8SJjefmc8,7
 numba_cuda/__init__.py,sha256=atXeUvJKR3JHcAiCFbXCVOJQUHgB1TulmsqSL_9RT3Q,114
 numba_cuda/_version.py,sha256=jbdUsbR7sVllw0KxQNB0-FMd929CGg3kH2fhHdrlkuc,719
 numba_cuda/numba/cuda/__init__.py,sha256=idyVHOObC9lTYnp62v7rVprSacRM4d5F6vhXfG5ElTI,621
@@ -8,12 +8,12 @@ numba_cuda/numba/cuda/api.py,sha256=shLu7NEZHRMcaZAMEXSoyA5Gi5m0tm6ZRymxKLEKCSg,
 numba_cuda/numba/cuda/api_util.py,sha256=aQfUV2-4RM_oGVvckMjbMr5e3effOQNX04v1T0O2EfQ,861
 numba_cuda/numba/cuda/args.py,sha256=HloHkw_PQal2DT-I70Xf_XbnGObS1jiUgcRrQ85Gq28,1978
 numba_cuda/numba/cuda/cg.py,sha256=9V1uZqyGOJX1aFd9c6GAPbLSqq83lE8LoP-vxxrKENY,1490
-numba_cuda/numba/cuda/codegen.py,sha256=
-numba_cuda/numba/cuda/compiler.py,sha256=
+numba_cuda/numba/cuda/codegen.py,sha256=ghdYBKZ3Mzk2UlLE64HkrAjb60PN9fibSNkWFRQuj4M,13184
+numba_cuda/numba/cuda/compiler.py,sha256=XQHzUCuXl6WCtWWxv1X3Y9ebcVQVJEkzOuckNwKa4Gg,21249
 numba_cuda/numba/cuda/cpp_function_wrappers.cu,sha256=iv84_F6Q9kFjV_kclrQz1msh6Dud8mI3qNkswTid7Qc,953
 numba_cuda/numba/cuda/cuda_fp16.h,sha256=1IC0mdNdkvKbvAe0-f4uYVS7WFrVqOyI1nRUbBiqr6A,126844
 numba_cuda/numba/cuda/cuda_fp16.hpp,sha256=vJ7NUr2X2tKhAP7ojydAiCoOjVO6n4QGoXD6m9Srrlw,89130
-numba_cuda/numba/cuda/cuda_paths.py,sha256=
+numba_cuda/numba/cuda/cuda_paths.py,sha256=C0gA72QLWUMfvXkFpw1WqqaFqfsQ7HM72hQVXG0A7RU,10023
 numba_cuda/numba/cuda/cudadecl.py,sha256=ynUidit8oPGjedc6p1miMGtS20DOji3DiQHzwmx6m0s,23192
 numba_cuda/numba/cuda/cudaimpl.py,sha256=3YMxQSCv2KClBrpuXGchrTNICV1F6NIjjL2rie5fDZ4,38628
 numba_cuda/numba/cuda/cudamath.py,sha256=EFNtdzEytAZuwijdRoFGzVKCeal76UzzaNy7wUFQx8I,3978
@@ -21,7 +21,7 @@ numba_cuda/numba/cuda/decorators.py,sha256=qSpir16-jPYSe2YuRZ6g9INeobmsMNg6ab9IZ
 numba_cuda/numba/cuda/descriptor.py,sha256=rNMaurJkjNjIBmHPozDoLC35DMURE0fn_LtnXRmaG_w,985
 numba_cuda/numba/cuda/device_init.py,sha256=lP79tCsQ0Np9xcbjv_lXcH4JOiVZvV8nwg3INdETxsc,3586
 numba_cuda/numba/cuda/deviceufunc.py,sha256=yxAH71dpgJWK8okmCJm0FUV6z2AqdThCYOTZspT7z0M,30775
-numba_cuda/numba/cuda/dispatcher.py,sha256=
+numba_cuda/numba/cuda/dispatcher.py,sha256=JuUr0-6xQtDkyaZv7CirWaU5_sSNX4BKCTDgQG5c1xc,41116
 numba_cuda/numba/cuda/errors.py,sha256=XwWHzCllx0DXU6BQdoRH0m3pznGxnTFOBTVYXMmCfqg,1724
 numba_cuda/numba/cuda/extending.py,sha256=URsyBYls2te-mgE0yvDY6akvawYCA0blBFfD7Lf9DO4,142
 numba_cuda/numba/cuda/initialize.py,sha256=TQGHGLQoq4ch4J6CLDcJdGsZzXM-g2kDgdyO1u-Rbhg,546
@@ -47,16 +47,16 @@ numba_cuda/numba/cuda/vectorizers.py,sha256=u_0EzaD5tqVH8uOz4Gmqn3FgPC1rckwDAQuR
 numba_cuda/numba/cuda/cudadrv/__init__.py,sha256=0TL4MZcJXUoo9qA7uu0vLv7eHrXRerVmyfi7O149ITw,199
 numba_cuda/numba/cuda/cudadrv/devicearray.py,sha256=06kM7iFcx1TYiFhs1o9r1kyoA3k5yS7mFAdZDf6nrxA,31215
 numba_cuda/numba/cuda/cudadrv/devices.py,sha256=6SneNmoq83gue0txFWWx4A65vViAa8xA06FzkApoqAk,7992
-numba_cuda/numba/cuda/cudadrv/driver.py,sha256=
+numba_cuda/numba/cuda/cudadrv/driver.py,sha256=bjlGcJvyjwMjRCNkNqmBIAA0HO_fzbrW2afXsp-YiCg,114794
 numba_cuda/numba/cuda/cudadrv/drvapi.py,sha256=52ms3X6hfPaQB8E1jb6g7QKqRvHzBMlDQ-V2DM1rXxQ,17178
 numba_cuda/numba/cuda/cudadrv/dummyarray.py,sha256=nXRngdr-k3h_BNGQuJUxmp89yGNWxqEDJedpwDPEZ44,14209
-numba_cuda/numba/cuda/cudadrv/enums.py,sha256=
+numba_cuda/numba/cuda/cudadrv/enums.py,sha256=Wy5dzukTk4TnWCowg_PLceET_v2xEyiWLu9TyH8pXr8,23742
 numba_cuda/numba/cuda/cudadrv/error.py,sha256=zEIryW6aIy8GG4ypmTliB6RgY4Gy2n8ckz7I6W99LUM,524
 numba_cuda/numba/cuda/cudadrv/libs.py,sha256=Gk9zQ1CKcsZsWl-_9QneXeP9VH5q5R1I3Cx043UOytk,7240
 numba_cuda/numba/cuda/cudadrv/linkable_code.py,sha256=Q_YTv0apBo9t8pkMlKrthPPSVeLd376ZTmVDF5NtVVo,1328
 numba_cuda/numba/cuda/cudadrv/mappings.py,sha256=-dTPHvAkDjdH6vS5OjgrB71AFuqKO6CRgf7hpOk2wiw,802
 numba_cuda/numba/cuda/cudadrv/ndarray.py,sha256=HtULWWFyDlgqvrH5459yyPTvU4UbUo2DSdtcNfvbH00,473
-numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=
+numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=RR096Ic2_Zu96C-GGh8x8WTOyxnmDkwtcwag8a_npkQ,10898
 numba_cuda/numba/cuda/cudadrv/nvvm.py,sha256=v2hJJTAQeRmoG59-hnhgMEp5BSVA73QHtEoy636VKao,24107
 numba_cuda/numba/cuda/cudadrv/rtapi.py,sha256=WdeUoWzsYNYodx8kMRLVIjnNs0QzwpCihd2Q0AaqItE,226
 numba_cuda/numba/cuda/cudadrv/runtime.py,sha256=Tj9ACrzQqNmDSO6xfpzw12EsQknSywQ-ZGuWMbDdHnQ,4255
@@ -103,7 +103,7 @@ numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py,sha256=0KPe4E9wOZsSV_0QI0Lmj
 numba_cuda/numba/cuda/tests/cudadrv/test_linker.py,sha256=_l2_EQEko2Jet5ooj4XMT0L4BjOuqLjbONGj1_MVI50,10161
 numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py,sha256=kYXYMkx_3GPAITKp4reLeM8KSzKkpxiC8nxnBvXpaTA,4979
 numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py,sha256=984jATSa01SRoSrVqxPeO6ujJ7w2jsnZa39ABInFLVI,1529
-numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py,sha256=
+numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py,sha256=VOOl5fLxQL5IKHEi8hL47hAH0BUf_D8NyIxptLxIwus,8856
 numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py,sha256=DF7KV5uh-yMztks0f47NhpalV64dvsNy-f8HY6GhAhE,7373
 numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py,sha256=u_TthSS2N-2J4eBIuF4PGg33AjD-wxly7MKpz0vRAKc,944
 numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py,sha256=MQWZx1j3lbEpWmIpQ1bV9szrGOV3VHN0QrEnJRjAhW4,508
@@ -137,8 +137,8 @@ numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py,sha256=ZQuct24GEZn
 numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py,sha256=73FCQbNaAKpuybAwMOt4eW_dL_K6ZjrRgQw09ojkSbY,15844
 numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py,sha256=y7cNQZOZJo5Sv16ql3E5QaRksw-U3RkXss9YDcNeiTk,2137
 numba_cuda/numba/cuda/tests/cudapy/test_datetime.py,sha256=2in1Cq8y9zAFoka7H72wF1D0awEd3n7bv56sUPgoNAQ,3508
-numba_cuda/numba/cuda/tests/cudapy/test_debug.py,sha256=
-numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_debug.py,sha256=3MYNiMe75rgBF1T0vsJ7r-nkW5jPvov_tDms9KXo2UU,3449
+numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py,sha256=8Tm1iD2x1BRryB1QY6qp6tdjJCE6Tx9p0LzcYwiExIU,7922
 numba_cuda/numba/cuda/tests/cudapy/test_device_func.py,sha256=aTRyZSOJB3sAShw0YAEgHILrR-TCuowW9KYjtlRErKM,6892
 numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py,sha256=oX-l_L4H8rME1IolwhAyordSGJ152nnuqGAFdWjfgas,26587
 numba_cuda/numba/cuda/tests/cudapy/test_enums.py,sha256=0GWiwvZ1FTzSl1FfMxttkWaWrowASfXrSDT8XAR4ZHw,3560
@@ -154,14 +154,14 @@ numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py,sha256=0NWfQqHmx7tFh6vdS7QtxT8
 numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py,sha256=Uhe8Q0u42jySrpwAZh8vCf4GMYkiy9NOMolyzEBuri0,5382
 numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py,sha256=luDtBxFS_5ZbVemXe1Z7gfqMliaU_EAOR4SuLsU5rhw,2677
 numba_cuda/numba/cuda/tests/cudapy/test_idiv.py,sha256=HLJ_f2lX8m_NNJjUbl_8zZ0-8GsBlRdBP2CUo_yWb0Y,1056
-numba_cuda/numba/cuda/tests/cudapy/test_inspect.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_inspect.py,sha256=hzK1Kk2c-aKCIL2QSodHpyxemOYaghgsMx7H1WvMHX8,4879
 numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py,sha256=M6-pad8nVM0fuL18uFxvE6tmHw0spLNhnMBLVlO0FKU,36400
 numba_cuda/numba/cuda/tests/cudapy/test_ipc.py,sha256=fggyy-kmsOkCb906_q3kXPGRziccWu7Co7ir83zBMwM,10536
 numba_cuda/numba/cuda/tests/cudapy/test_iterators.py,sha256=daQW3kSkp7icCmlTn9pCvnaauz60k_eBf4x1UQF-XVY,2344
 numba_cuda/numba/cuda/tests/cudapy/test_lang.py,sha256=U1BCVZMjU1AZ4wDSmjsRIPPcAReiq4dB77Cz7GmrdmA,1691
 numba_cuda/numba/cuda/tests/cudapy/test_laplace.py,sha256=yD--H5p_NrBHklFNCnxuQ0S8yUIBYScBkvn7hBlZ5ZM,3211
 numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py,sha256=4NsZBXweDPQpqfgo6T7eQHaWDVBof1CZDTpI1QTkV74,6545
-numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py,sha256=cimoEJqCWepvJPIqUumpLjQimg80je-WNul1MfT6KVY,6824
 numba_cuda/numba/cuda/tests/cudapy/test_localmem.py,sha256=uv9UYuytIXQgzHpPgEoWVVVq5-a7-6Io_mWMiNsZ45I,5376
 numba_cuda/numba/cuda/tests/cudapy/test_mandel.py,sha256=crVQBw46l4iyAv8_pu7v1eBy9ZJG7OkigB5zsyi6s3A,1085
 numba_cuda/numba/cuda/tests/cudapy/test_math.py,sha256=T-KRh9qzwOL3usl_6Cly3FVlvauzGhGnedfAG1hBQy8,27615
@@ -173,7 +173,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py,sha256=AjYbSa9nOlv_yc
 numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py,sha256=MfCbyJZu1XsCJOCSw6vvhs4eiP4LZPcF-e9huPmW-ys,2861
 numba_cuda/numba/cuda/tests/cudapy/test_nondet.py,sha256=mYMX0R1tmBLRe5ZAwiDVFFuSyMuPav5guuqL3WHWGPY,1378
 numba_cuda/numba/cuda/tests/cudapy/test_operator.py,sha256=0nJej4D898_JU-jhlif44fR2yu42keK4GoCLP810l3U,13295
-numba_cuda/numba/cuda/tests/cudapy/test_optimization.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_optimization.py,sha256=IRTI-b7hwMaJxtxFRzoTjpzzeqWGzNyCJPT6C4GugX4,2925
 numba_cuda/numba/cuda/tests/cudapy/test_overload.py,sha256=u4yUDVFcV9E3NWMlNjM81e3IW4KaIkcDtXig8JYevsw,8538
 numba_cuda/numba/cuda/tests/cudapy/test_powi.py,sha256=TI82rYRnkSnwv9VN6PMpBnr9JqMJ_F3HhH4cKY6O8tw,3276
 numba_cuda/numba/cuda/tests/cudapy/test_print.py,sha256=r2xmMNx80_ANi3uFB3CQt3AHAXG_JdhStY1S796hlK0,4466
@@ -187,7 +187,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_serialize.py,sha256=alE5-lTwbjz3Tv6OvQPS
 numba_cuda/numba/cuda/tests/cudapy/test_slicing.py,sha256=bAh_sIk5V9_0_dOVGdzmyjwZkHMLjEbQuEI4e5zRMoU,903
 numba_cuda/numba/cuda/tests/cudapy/test_sm.py,sha256=kh1F0wwQ2_bd54Q4GUX99y2oiWHQwBpyC__ckk-jiTU,14575
 numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py,sha256=bTXDjU94ezo6Bz_lktlPyowTcJHBOWfy7-nJB9e-B_s,7231
-numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py,sha256=pCU0B-yBavHLgyhlKYAs1SCG8BWim9dSvl2BjXkhgQ4,1868
 numba_cuda/numba/cuda/tests/cudapy/test_sync.py,sha256=Y851UqNkT80U9q_C05SQfvPRCY7jjRARHOMk6g0lU4Y,7837
 numba_cuda/numba/cuda/tests/cudapy/test_transpose.py,sha256=JAQX2EUHwlpKCfJDGspaldmsIRbHxnXpsNUrvRrnIEE,3134
 numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py,sha256=-ehvkxelr45aT8sUNL9Hq8cn2GU_K4GL1yWeX-rHqEM,9680
@@ -232,12 +232,12 @@ numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py,sha256=n0_-xFaw6QqiZbhe55oy7lnEe
 numba_cuda/numba/cuda/tests/nrt/__init__.py,sha256=43EXdiXXRBd6yIcVGMrU9F_EJCD9Uw3mzOP3SB53AEE,260
 numba_cuda/numba/cuda/tests/nrt/mock_numpy.py,sha256=Qtn52GoKZ_ydre3oqkLWVdImC37tuPClUy4uHSutaJo,1568
 numba_cuda/numba/cuda/tests/nrt/test_nrt.py,sha256=Ox6ei2DldvSSS-CndTXRxLnsvWdteOQNgn6GvKHB244,2789
-numba_cuda/numba/cuda/tests/test_binary_generation/Makefile,sha256=
+numba_cuda/numba/cuda/tests/test_binary_generation/Makefile,sha256=P2WzCc5d64JGq6pJwHEwmKVmJOJxPBtsMTbnuzqYkik,2679
 numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=V0raLZLGSiWbE_K-JluI0CnmNkXbhlMVj-TH7P1OV8E,5014
 numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu,sha256=cUf-t6ZM9MK_x7X_aKwsrKW1LdR97XcpR-qnYr5faOE,453
 numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu,sha256=q3oxZziT8KDodeNcEBiWULH6vMrHCWucmJmtrg8C0d0,128
-numba_cuda-0.0.19.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
-numba_cuda-0.0.19.dist-info/METADATA,sha256=
-numba_cuda-0.0.19.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-numba_cuda-0.0.19.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
-numba_cuda-0.0.19.dist-info/RECORD,,
+numba_cuda-0.0.21.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
+numba_cuda-0.0.21.dist-info/METADATA,sha256=U_oWdBsw_mdsI2AnFJDXdxTXL2ytOeuTHwS3wCZswTI,1497
+numba_cuda-0.0.21.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+numba_cuda-0.0.21.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
+numba_cuda-0.0.21.dist-info/RECORD,,
{numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/LICENSE
File without changes
{numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/WHEEL
File without changes
{numba_cuda-0.0.19.dist-info → numba_cuda-0.0.21.dist-info}/top_level.txt
File without changes