numba-cuda 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +7 -6
- numba_cuda/numba/cuda/cudadecl.py +6 -2
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +1 -20
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +13 -9
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +5 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +6 -1
- numba_cuda/numba/cuda/decorators.py +9 -2
- numba_cuda/numba/cuda/dispatcher.py +22 -3
- numba_cuda/numba/cuda/runtime/__init__.py +1 -0
- numba_cuda/numba/cuda/runtime/memsys.cu +94 -0
- numba_cuda/numba/cuda/runtime/memsys.cuh +17 -0
- numba_cuda/numba/cuda/runtime/nrt.cu +19 -22
- numba_cuda/numba/cuda/runtime/nrt.py +318 -0
- numba_cuda/numba/cuda/testing.py +11 -1
- numba_cuda/numba/cuda/tests/__init__.py +1 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +31 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +145 -11
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +10 -7
- numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +105 -1
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +162 -40
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +114 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/utils.py +22 -0
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/METADATA +21 -3
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/RECORD +30 -23
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/WHEEL +1 -1
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/LICENSE +0 -0
- {numba_cuda-0.3.0.dist-info → numba_cuda-0.5.0.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
-0.3.0
+0.5.0
numba_cuda/numba/cuda/compiler.py
CHANGED
@@ -570,16 +570,16 @@ def compile_ptx_for_current_device(pyfunc, sig, debug=None, lineinfo=False,
                        abi=abi, abi_info=abi_info)


-def declare_device_function(name, restype, argtypes):
-    return declare_device_function_template(name, restype, argtypes).key
+def declare_device_function(name, restype, argtypes, link):
+    return declare_device_function_template(name, restype, argtypes, link).key


-def declare_device_function_template(name, restype, argtypes):
+def declare_device_function_template(name, restype, argtypes, link):
     from .descriptor import cuda_target
     typingctx = cuda_target.typing_context
     targetctx = cuda_target.target_context
     sig = typing.signature(restype, *argtypes)
-    extfn = ExternFunction(name, sig)
+    extfn = ExternFunction(name, sig, link)

     class device_function_template(ConcreteTemplate):
         key = extfn
@@ -593,7 +593,8 @@ def declare_device_function_template(name, restype, argtypes):
     return device_function_template


-class ExternFunction
-    def __init__(self, name, sig):
+class ExternFunction:
+    def __init__(self, name, sig, link):
         self.name = name
         self.sig = sig
+        self.link = link
numba_cuda/numba/cuda/cudadecl.py
CHANGED
@@ -403,16 +403,20 @@ _genfp16_binary_operator(operator.itruediv)


 def _resolve_wrapped_unary(fname):
+    link = tuple()
     decl = declare_device_function_template(f'__numba_wrapper_{fname}',
                                             types.float16,
-                                            (types.float16,))
+                                            (types.float16,),
+                                            link)
     return types.Function(decl)


 def _resolve_wrapped_binary(fname):
+    link = tuple()
     decl = declare_device_function_template(f'__numba_wrapper_{fname}',
                                             types.float16,
-                                            (types.float16, types.float16,))
+                                            (types.float16, types.float16,),
+                                            link)
     return types.Function(decl)


numba_cuda/numba/cuda/cudadrv/devicearray.py
CHANGED
@@ -570,10 +570,13 @@ class DeviceNDArray(DeviceNDArrayBase):
         '''
         return self._dummy.is_c_contig

-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         """
         :return: an `numpy.ndarray`, so copies to the host.
         """
+        if copy is False:
+            msg = "`copy=False` is not supported. A copy is always created."
+            raise ValueError(msg)
         if dtype:
             return self.copy_to_host().__array__(dtype)
         else:
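For context, a minimal sketch of the behaviour this adds, assuming NumPy 2 (which passes `copy` through the `__array__` protocol): conversion to a host array always copies, and an explicit no-copy request is rejected.

```python
import numpy as np
from numba import cuda

d_arr = cuda.to_device(np.arange(4))

host = np.asarray(d_arr)           # always a device-to-host copy
assert isinstance(host, np.ndarray)

try:
    np.asarray(d_arr, copy=False)  # a no-copy view of device memory is impossible
except ValueError:
    pass
```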
numba_cuda/numba/cuda/cudadrv/driver.py
CHANGED
@@ -18,7 +18,6 @@ import functools
 import warnings
 import logging
 import threading
-import traceback
 import asyncio
 import pathlib
 import subprocess
@@ -40,6 +39,7 @@ from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
 from .mappings import FILE_EXTENSION_MAP
 from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
+from numba.cuda.utils import _readenv
 from numba.cuda.cudadrv import enums, drvapi, nvrtc

 try:
@@ -66,25 +66,6 @@ _py_decref.argtypes = [ctypes.py_object]
 _py_incref.argtypes = [ctypes.py_object]


-def _readenv(name, ctor, default):
-    value = os.environ.get(name)
-    if value is None:
-        return default() if callable(default) else default
-    try:
-        if ctor is bool:
-            return value.lower() in {'1', "true"}
-        return ctor(value)
-    except Exception:
-        warnings.warn(
-            f"Environment variable '{name}' is defined but its associated "
-            f"value '{value}' could not be parsed.\n"
-            "The parse failed with exception:\n"
-            f"{traceback.format_exc()}",
-            RuntimeWarning
-        )
-        return default
-
-
 _MVC_ERROR_MESSAGE = (
     "Minor version compatibility requires ptxcompiler and cubinlinker packages "
     "to be available"
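The helper removed here is unchanged in behaviour, only relocated to `numba.cuda.utils` (see the new import above). A small sketch of how it parses values; the environment variable names below are hypothetical:

```python
import os
from numba.cuda.utils import _readenv

os.environ["HYPOTHETICAL_FLAG"] = "true"

# bool values are parsed from "1"/"true"; anything else is False
assert _readenv("HYPOTHETICAL_FLAG", bool, False) is True

# unset variables fall back to the default (which is called if callable)
assert _readenv("HYPOTHETICAL_UNSET", int, 42) == 42
```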
numba_cuda/numba/cuda/cudadrv/linkable_code.py
CHANGED
@@ -2,8 +2,12 @@ from .mappings import FILE_EXTENSION_MAP


 class LinkableCode:
-    """An object that
-
+    """An object that holds code to be linked from memory.
+
+    :param data: A buffer containing the data to link.
+    :param name: The name of the file to be referenced in any compilation or
+                 linking errors that may be produced.
+    """

     def __init__(self, data, name=None):
         self.data = data
@@ -15,49 +19,49 @@ class LinkableCode:


 class PTXSource(LinkableCode):
-    """PTX
+    """PTX source code in memory."""

     kind = FILE_EXTENSION_MAP["ptx"]
     default_name = "<unnamed-ptx>"


 class CUSource(LinkableCode):
-    """CUDA C/C++
+    """CUDA C/C++ source code in memory."""

     kind = "cu"
     default_name = "<unnamed-cu>"


 class Fatbin(LinkableCode):
-    """
+    """An ELF Fatbin in memory."""

     kind = FILE_EXTENSION_MAP["fatbin"]
     default_name = "<unnamed-fatbin>"


 class Cubin(LinkableCode):
-    """
+    """An ELF Cubin in memory."""

     kind = FILE_EXTENSION_MAP["cubin"]
     default_name = "<unnamed-cubin>"


 class Archive(LinkableCode):
-    """An archive of objects in memory"""
+    """An archive of objects in memory."""

     kind = FILE_EXTENSION_MAP["a"]
     default_name = "<unnamed-archive>"


 class Object(LinkableCode):
-    """An object file in memory"""
+    """An object file in memory."""

     kind = FILE_EXTENSION_MAP["o"]
     default_name = "<unnamed-object>"


 class LTOIR(LinkableCode):
-    """An LTOIR file in memory"""
+    """An LTOIR file in memory."""

     kind = "ltoir"
     default_name = "<unnamed-ltoir>"
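As a rough usage sketch (the CUDA source and names below are illustrative, not from the package), these wrappers let code held in memory be linked wherever a file path would otherwise be given:

```python
from numba import cuda
from numba.cuda.cudadrv.linkable_code import CUSource

# Follows the declare_device C ABI: the result is written through the first
# pointer argument and an int status is returned.
times2_src = CUSource("""
extern "C" __device__ int times2(int *out, int x) {
    *out = x * 2;
    return 0;
}
""", name="times2.cu")

times2 = cuda.declare_device('times2', 'int32(int32)')

@cuda.jit(link=[times2_src])
def double_all(arr):
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] = times2(arr[i])
```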
numba_cuda/numba/cuda/cudadrv/nvrtc.py
CHANGED
@@ -266,7 +266,11 @@ def compile(src, name, cc, ltoir=False):
     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
     numba_include = f'-I{numba_cuda_path}'
-    options = [arch, *cuda_include, numba_include, '-rdc', 'true']
+
+    nrt_path = os.path.join(numba_cuda_path, "runtime")
+    nrt_include = f'-I{nrt_path}'
+
+    options = [arch, *cuda_include, numba_include, nrt_include, '-rdc', 'true']

     if ltoir:
         options.append("-dlto")
numba_cuda/numba/cuda/cudadrv/nvvm.py
CHANGED
@@ -314,7 +314,9 @@ COMPUTE_CAPABILITIES = (
     (6, 0), (6, 1), (6, 2),
     (7, 0), (7, 2), (7, 5),
     (8, 0), (8, 6), (8, 7), (8, 9),
-    (9, 0)
+    (9, 0),
+    (10, 0), (10, 1),
+    (12, 0),
 )

 # Maps CTK version -> (min supported cc, max supported cc) inclusive
@@ -331,6 +333,9 @@ CTK_SUPPORTED = {
     (12, 2): ((5, 0), (9, 0)),
     (12, 3): ((5, 0), (9, 0)),
     (12, 4): ((5, 0), (9, 0)),
+    (12, 5): ((5, 0), (9, 0)),
+    (12, 6): ((5, 0), (9, 0)),
+    (12, 8): ((5, 0), (12, 0)),
 }


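Illustrative only (the clamping helper below is not part of the module): the table maps a CUDA toolkit version to the inclusive range of compute capabilities it can generate code for, so a device's CC can be lowered to the newest one the detected CTK supports.

```python
# Excerpt of CTK_SUPPORTED from above
CTK_SUPPORTED = {
    (12, 4): ((5, 0), (9, 0)),
    (12, 8): ((5, 0), (12, 0)),
}

def clamp_cc(ctk_version, device_cc):
    min_cc, max_cc = CTK_SUPPORTED[ctk_version]
    return max(min_cc, min(device_cc, max_cc))

assert clamp_cc((12, 4), (12, 0)) == (9, 0)    # Blackwell device, older CTK
assert clamp_cc((12, 8), (12, 0)) == (12, 0)   # fully supported on CTK 12.8
```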
numba_cuda/numba/cuda/decorators.py
CHANGED
@@ -173,7 +173,7 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None,
     return disp


-def declare_device(name, sig):
+def declare_device(name, sig, link=None):
     """
     Declare the signature of a foreign function. Returns a descriptor that can
     be used to call the function from a Python kernel.
@@ -181,10 +181,17 @@ def declare_device(name, sig):
     :param name: The name of the foreign function.
     :type name: str
     :param sig: The Numba signature of the function.
+    :param link: External code to link when calling the function.
     """
+    if link is None:
+        link = tuple()
+    else:
+        if not isinstance(link, (list, tuple, set)):
+            link = (link,)
+
     argtypes, restype = sigutils.normalize_signature(sig)
     if restype is None:
         msg = 'Return type must be provided for device declarations'
         raise TypeError(msg)

-    return declare_device_function(name, restype, argtypes)
+    return declare_device_function(name, restype, argtypes, link)
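A minimal sketch of the extended API (the file name and external function are hypothetical): code required by a declared foreign function can now travel with the declaration instead of being repeated in every `@cuda.jit(link=...)` decorator.

```python
from numba import cuda

# 'mul.cu' is assumed to define an extern "C" __device__ function following
# the declare_device ABI; a single item is wrapped into a tuple as shown above.
mul = cuda.declare_device('mul_f32', 'float32(float32, float32)',
                          link='mul.cu')

@cuda.jit
def multiply(x, y, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = mul(x[i], y[i])
```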
numba_cuda/numba/cuda/dispatcher.py
CHANGED
@@ -11,16 +11,18 @@ from numba.core.compiler_lock import global_compiler_lock
 from numba.core.dispatcher import Dispatcher
 from numba.core.errors import NumbaPerformanceWarning
 from numba.core.typing.typeof import Purpose, typeof
-
+from numba.core.types.functions import Function
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
-from numba.cuda.compiler import compile_cuda, CUDACompiler, kernel_fixup
+from numba.cuda.compiler import (compile_cuda, CUDACompiler, kernel_fixup,
+                                 ExternFunction)
 from numba.cuda.cudadrv import driver
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.errors import (missing_launch_config_msg,
                                normalize_kernel_dimensions)
 from numba.cuda import types as cuda_types
+from numba.cuda.runtime.nrt import rtsys

 from numba import cuda
 from numba import _dispatcher
@@ -157,6 +159,16 @@ class _Kernel(serialize.ReduceMixin):

         self.maybe_link_nrt(link, tgt_ctx, asm)

+        for k, v in cres.fndesc.typemap.items():
+            if not isinstance(v, Function):
+                continue
+
+            if not isinstance(v.typing_key, ExternFunction):
+                continue
+
+            for obj in v.typing_key.link:
+                lib.add_linking_file(obj)
+
         for filepath in link:
             lib.add_linking_file(filepath)

@@ -253,7 +265,14 @@ class _Kernel(serialize.ReduceMixin):
         """
         Force binding to current CUDA context
         """
-        self._codelibrary.get_cufunc()
+        cufunc = self._codelibrary.get_cufunc()
+
+        if hasattr(self, "target_context") and self.target_context.enable_nrt:
+            rtsys.ensure_initialized()
+            rtsys.set_memsys_to_module(cufunc.module)
+            # We don't know which stream the kernel will be launched on, so
+            # we force synchronize here.
+            cuda.synchronize()

     @property
     def regs_per_thread(self):
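The calls that `bind()` now makes can also be issued up front; a minimal sketch, assuming a target context with NRT enabled:

```python
from numba.cuda.runtime.nrt import rtsys

# Initialize the NRT memory system once; bind() then attaches it to each
# kernel's module via set_memsys_to_module(), as shown above.
rtsys.ensure_initialized()
```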
numba_cuda/numba/cuda/runtime/__init__.py
ADDED
@@ -0,0 +1 @@
+from numba.cuda.runtime.nrt import rtsys  # noqa: F401
numba_cuda/numba/cuda/runtime/memsys.cu
ADDED
@@ -0,0 +1,94 @@
+#include "memsys.cuh"
+
+__device__ size_t memsys_size = sizeof(NRT_MemSys);
+
+namespace detail
+{
+void __device__ check_memsys()
+{
+  if (TheMSys == nullptr)
+  {
+    assert(false && "TheMSys pointer is null. Please use NRT_MemSys_set to set pointer first.");
+  }
+}
+}
+
+extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
+{
+  TheMSys = memsys_ptr;
+}
+
+extern "C" __global__ void NRT_MemSys_read(uint64_t *managed_memsys)
+{
+  detail::check_memsys();
+  managed_memsys[0] = TheMSys->stats.alloc;
+  managed_memsys[1] = TheMSys->stats.free;
+  managed_memsys[2] = TheMSys->stats.mi_alloc;
+  managed_memsys[3] = TheMSys->stats.mi_free;
+}
+
+extern "C" __global__ void NRT_MemSys_read_alloc(uint64_t *managed_result)
+{
+  detail::check_memsys();
+  managed_result[0] = TheMSys->stats.alloc;
+}
+
+extern "C" __global__ void NRT_MemSys_read_free(uint64_t *managed_result)
+{
+  detail::check_memsys();
+  managed_result[0] = TheMSys->stats.free;
+}
+
+extern "C" __global__ void NRT_MemSys_read_mi_alloc(uint64_t *managed_result)
+{
+  detail::check_memsys();
+  managed_result[0] = TheMSys->stats.mi_alloc;
+}
+
+extern "C" __global__ void NRT_MemSys_read_mi_free(uint64_t *managed_result)
+{
+  detail::check_memsys();
+  managed_result[0] = TheMSys->stats.mi_free;
+}
+
+extern "C" __global__ void NRT_MemSys_init(void)
+{
+  detail::check_memsys();
+  TheMSys->stats.enabled = false;
+  TheMSys->stats.alloc = 0;
+  TheMSys->stats.free = 0;
+  TheMSys->stats.mi_alloc = 0;
+  TheMSys->stats.mi_free = 0;
+}
+
+extern "C" __global__ void NRT_MemSys_enable_stats(void)
+{
+  detail::check_memsys();
+  TheMSys->stats.enabled = true;
+}
+
+extern "C" __global__ void NRT_MemSys_disable_stats(void)
+{
+  detail::check_memsys();
+  TheMSys->stats.enabled = false;
+}
+
+extern "C" __global__ void NRT_MemSys_stats_enabled(uint8_t *enabled)
+{
+  detail::check_memsys();
+  *enabled = static_cast<uint8_t>(TheMSys->stats.enabled);
+}
+
+extern "C" __global__ void NRT_MemSys_print(void)
+{
+  if (TheMSys != nullptr)
+  {
+    printf("TheMSys->stats.enabled %d\n", TheMSys->stats.enabled);
+    printf("TheMSys->stats.alloc %lu\n", TheMSys->stats.alloc.load());
+    printf("TheMSys->stats.free %lu\n", TheMSys->stats.free.load());
+    printf("TheMSys->stats.mi_alloc %lu\n", TheMSys->stats.mi_alloc.load());
+    printf("TheMSys->stats.mi_free %lu\n", TheMSys->stats.mi_free.load());
+  } else {
+    printf("TheMsys is null.\n");
+  }
+}
numba_cuda/numba/cuda/runtime/memsys.cuh
ADDED
@@ -0,0 +1,17 @@
+#include <cuda/atomic>
+
+// Globally needed variables
+struct NRT_MemSys {
+  struct {
+    bool enabled;
+    cuda::atomic<size_t, cuda::thread_scope_device> alloc;
+    cuda::atomic<size_t, cuda::thread_scope_device> free;
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
+    cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
+  } stats;
+};
+
+/* The Memory System object */
+__device__ NRT_MemSys* TheMSys;
+
+extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr);
numba_cuda/numba/cuda/runtime/nrt.cu
CHANGED
@@ -3,6 +3,8 @@

 #include <cuda/atomic>

+#include "memsys.cuh"
+
 typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
 typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);

@@ -18,29 +20,21 @@ struct MemInfo {
 };
 }

-// Globally needed variables
-struct NRT_MemSys {
-  struct {
-    bool enabled;
-    cuda::atomic<size_t, cuda::thread_scope_device> alloc;
-    cuda::atomic<size_t, cuda::thread_scope_device> free;
-    cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
-    cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
-  } stats;
-};
+extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
+{
+  TheMSys = memsys_ptr;
+}

 static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
 static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
 extern "C" __device__ void* NRT_Allocate_External(size_t size);

-/* The Memory System object */
-__device__ NRT_MemSys* TheMSys;
-
 extern "C" __device__ void* NRT_Allocate(size_t size)
 {
   void* ptr = NULL;
   ptr = malloc(size);
-
+  if (TheMSys && TheMSys->stats.enabled) {
+    TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed); }
   return ptr;
 }

@@ -49,14 +43,14 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
                                             size_t size,
                                             NRT_dtor_function dtor,
                                             void* dtor_info)
-                                            // NRT_MemSys* TheMSys)
 {
   mi->refct = 1; /* starts with 1 refct */
   mi->dtor = dtor;
   mi->dtor_info = dtor_info;
   mi->data = data;
   mi->size = size;
-
+  if (TheMSys && TheMSys->stats.enabled) {
+    TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed); }
 }

 extern "C"
@@ -71,7 +65,8 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
 extern "C" __device__ void NRT_Free(void* ptr)
 {
   free(ptr);
-
+  if (TheMSys && TheMSys->stats.enabled) {
+    TheMSys->stats.free.fetch_add(1, cuda::memory_order_relaxed); }
 }

 extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
@@ -82,8 +77,10 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
 extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
 {
   NRT_dealloc(mi);
-
+  if (TheMSys && TheMSys->stats.enabled) {
+    TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed); }
 }
+
 extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
 {
   if (mi->dtor) /* We have a destructor */
@@ -158,10 +155,10 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
   ptr = malloc(size);
   //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));

-
-
-
-
+  if (TheMSys && TheMSys->stats.enabled)
+  {
+    TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed);
+  }
   return ptr;
 }
