tinygrad 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +6 -6
- tinygrad/codegen/__init__.py +0 -0
- tinygrad/codegen/kernel.py +253 -225
- tinygrad/codegen/linearizer.py +398 -436
- tinygrad/codegen/uops.py +451 -0
- tinygrad/device.py +268 -274
- tinygrad/dtype.py +56 -40
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +198 -0
- tinygrad/engine/realize.py +192 -0
- tinygrad/engine/schedule.py +370 -0
- tinygrad/engine/search.py +199 -0
- tinygrad/{mlops.py → function.py} +40 -32
- tinygrad/helpers.py +144 -46
- tinygrad/lazy.py +143 -242
- tinygrad/multi.py +173 -0
- tinygrad/nn/__init__.py +180 -9
- tinygrad/nn/datasets.py +8 -0
- tinygrad/nn/optim.py +106 -28
- tinygrad/nn/state.py +87 -19
- tinygrad/ops.py +104 -45
- tinygrad/renderer/__init__.py +65 -0
- tinygrad/renderer/assembly.py +269 -0
- tinygrad/renderer/cstyle.py +308 -210
- tinygrad/renderer/llvmir.py +119 -124
- tinygrad/runtime/__init__.py +0 -0
- tinygrad/runtime/autogen/amd_gpu.py +13403 -0
- tinygrad/runtime/autogen/comgr.py +891 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5893 -0
- tinygrad/runtime/autogen/io_uring.py +1486 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33597 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/__init__.py +0 -0
- tinygrad/runtime/driver/hip_comgr.py +56 -0
- tinygrad/runtime/graph/__init__.py +0 -0
- tinygrad/runtime/graph/clang.py +39 -0
- tinygrad/runtime/graph/cuda.py +59 -54
- tinygrad/runtime/graph/hcq.py +187 -0
- tinygrad/runtime/graph/metal.py +37 -41
- tinygrad/runtime/ops_amd.py +550 -0
- tinygrad/runtime/ops_clang.py +16 -14
- tinygrad/runtime/ops_cuda.py +129 -37
- tinygrad/runtime/ops_disk.py +111 -43
- tinygrad/runtime/ops_gpu.py +52 -50
- tinygrad/runtime/ops_llvm.py +36 -56
- tinygrad/runtime/ops_metal.py +41 -24
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +625 -0
- tinygrad/runtime/ops_python.py +208 -0
- tinygrad/shape/__init__.py +0 -0
- tinygrad/shape/shapetracker.py +46 -107
- tinygrad/shape/symbolic.py +99 -98
- tinygrad/shape/view.py +162 -45
- tinygrad/tensor.py +2492 -483
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
- tinygrad-0.9.1.dist-info/RECORD +63 -0
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
- tinygrad/features/image.py +0 -93
- tinygrad/features/multi.py +0 -103
- tinygrad/features/search.py +0 -160
- tinygrad/graph.py +0 -106
- tinygrad/jit.py +0 -152
- tinygrad/realize.py +0 -50
- tinygrad/runtime/graph/hip.py +0 -24
- tinygrad/runtime/ops_cpu.py +0 -45
- tinygrad/runtime/ops_hip.py +0 -97
- tinygrad/runtime/ops_torch.py +0 -49
- tinygrad-0.8.0.dist-info/RECORD +0 -41
- {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_cuda.py
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Tuple, Optional
|
5
|
-
import
|
6
|
-
from tinygrad.helpers import DEBUG, getenv, from_mv,
|
7
|
-
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
|
8
|
-
from tinygrad.codegen.kernel import LinearizerOptions
|
4
|
+
from typing import Tuple, Optional, List
|
5
|
+
import tinygrad.runtime.autogen.cuda as cuda
|
6
|
+
from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
|
7
|
+
from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
|
9
8
|
from tinygrad.renderer.cstyle import CUDARenderer
|
9
|
+
from tinygrad.renderer.assembly import PTXRenderer
|
10
|
+
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
|
10
11
|
|
11
12
|
def pretty_ptx(s):
|
12
13
|
# all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
|
@@ -22,72 +23,163 @@ CUDACPU = getenv("CUDACPU") == 1
|
|
22
23
|
if CUDACPU:
|
23
24
|
gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
|
24
25
|
gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] # noqa: E501
|
25
|
-
cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # noqa: E501
|
26
|
+
cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # type: ignore # noqa: E501
|
26
27
|
|
27
28
|
def check(status):
|
28
29
|
if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
|
29
30
|
|
30
|
-
def
|
31
|
+
def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
|
32
|
+
c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] +
|
33
|
+
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
|
34
|
+
vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2),
|
35
|
+
ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p), ctypes.c_void_p(0))
|
36
|
+
return c_args, vargs
|
31
37
|
|
32
|
-
def
|
38
|
+
def cu_time_execution(cb, enable=False) -> Optional[float]:
|
39
|
+
if CUDACPU: return cpu_time_execution(cb, enable=enable)
|
40
|
+
if not enable: return cb()
|
41
|
+
evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
|
42
|
+
cuda.cuEventRecord(evs[0], None)
|
43
|
+
cb()
|
44
|
+
cuda.cuEventRecord(evs[1], None)
|
45
|
+
check(cuda.cuEventSynchronize(evs[1]))
|
46
|
+
cuda.cuEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), evs[0], evs[1])
|
47
|
+
for ev in evs: cuda.cuEventDestroy_v2(ev)
|
48
|
+
return ret.value * 1e-3
|
49
|
+
|
50
|
+
def _get_bytes(arg, get_str, get_sz, check) -> bytes:
|
51
|
+
sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
|
52
|
+
return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
|
53
|
+
|
54
|
+
class PTXCompiler(Compiler):
|
55
|
+
def __init__(self, arch:str):
|
56
|
+
self.arch = arch
|
57
|
+
self.version = "7.8" if arch >= "sm_89" else "7.5"
|
58
|
+
super().__init__(f"compile_ptx_{self.arch}")
|
59
|
+
def compile(self, src:str) -> bytes: return src.replace("TARGET", self.arch).replace("VERSION", self.version).encode()
|
60
|
+
|
61
|
+
class CUDACompiler(Compiler):
|
62
|
+
def __init__(self, arch:str):
|
63
|
+
self.arch = arch
|
64
|
+
check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
|
65
|
+
self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
|
66
|
+
if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
|
67
|
+
super().__init__(f"compile_cuda_{self.arch}")
|
68
|
+
def compile(self, src:str) -> bytes:
|
69
|
+
check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
|
70
|
+
status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
|
71
|
+
|
72
|
+
if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
|
73
|
+
return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
|
74
|
+
|
75
|
+
def cuda_disassemble(lib, arch):
|
76
|
+
try:
|
77
|
+
fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
|
78
|
+
with open(fn + ".ptx", "wb") as f: f.write(lib)
|
79
|
+
subprocess.run(["ptxas", f"-arch={arch}", "-o", fn, fn+".ptx"], check=True)
|
80
|
+
print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
|
81
|
+
except Exception as e: print("failed to generate SASS", str(e))
|
33
82
|
|
34
83
|
class CUDAProgram:
|
35
84
|
def __init__(self, device:CUDADevice, name:str, lib:bytes):
|
36
85
|
self.device, self.name, self.lib = device, name, lib
|
37
|
-
if DEBUG >= 5: print(pretty_ptx(lib.decode('utf-8')))
|
38
|
-
if DEBUG >= 6:
|
39
|
-
try:
|
40
|
-
fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
|
41
|
-
with open(fn + ".ptx", "wb") as f: f.write(lib)
|
42
|
-
subprocess.run(["ptxas", f"-arch={CUDADevice.default_arch_name}", "-o", fn, fn+".ptx"], check=True)
|
43
|
-
print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
|
44
|
-
except Exception as e: print("failed to generate SASS", str(e))
|
86
|
+
if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
|
87
|
+
if DEBUG >= 6: cuda_disassemble(lib, device.arch)
|
45
88
|
|
46
|
-
if
|
89
|
+
if CUDACPU: self.prg = lib
|
90
|
+
else:
|
47
91
|
check(cuda.cuCtxSetCurrent(self.device.context))
|
48
|
-
self.module =
|
92
|
+
self.module = cuda.CUmodule()
|
93
|
+
status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
|
94
|
+
if status != 0:
|
95
|
+
del self.module
|
96
|
+
cuda_disassemble(lib, device.arch)
|
97
|
+
raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
|
49
98
|
check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
|
50
|
-
|
99
|
+
self.prg = prg #type: ignore
|
51
100
|
|
52
101
|
def __del__(self):
|
53
|
-
if
|
102
|
+
if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
|
54
103
|
|
55
|
-
def __call__(self, *
|
56
|
-
if
|
57
|
-
|
58
|
-
|
104
|
+
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
105
|
+
if CUDACPU: self.vargs = args+tuple(vals)
|
106
|
+
else:
|
107
|
+
check(cuda.cuCtxSetCurrent(self.device.context))
|
108
|
+
if not hasattr(self, "vargs"):
|
109
|
+
self.c_args, self.vargs = encode_args(args, vals) #type: ignore
|
110
|
+
else:
|
111
|
+
for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
|
112
|
+
for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
|
113
|
+
return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs)), enable=wait)
|
59
114
|
|
60
115
|
class CUDAAllocator(LRUAllocator):
|
61
116
|
def __init__(self, device:CUDADevice):
|
62
117
|
self.device = device
|
63
118
|
super().__init__()
|
64
|
-
def _alloc(self, size):
|
119
|
+
def _alloc(self, size, options:BufferOptions):
|
65
120
|
check(cuda.cuCtxSetCurrent(self.device.context))
|
121
|
+
if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
|
66
122
|
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
|
67
|
-
def _free(self, opaque):
|
123
|
+
def _free(self, opaque, options:BufferOptions):
|
124
|
+
if options.host: check(cuda.cuMemFreeHost(opaque))
|
125
|
+
else: check(cuda.cuMemFree_v2(opaque))
|
68
126
|
def copyin(self, dest, src:memoryview):
|
69
127
|
check(cuda.cuCtxSetCurrent(self.device.context))
|
70
|
-
|
128
|
+
host_mem = self.alloc(len(src), BufferOptions(host=True))
|
129
|
+
self.device.pending_copyin.append((host_mem, len(src), BufferOptions(host=True)))
|
130
|
+
ctypes.memmove(host_mem, from_mv(src), len(src))
|
131
|
+
check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
|
71
132
|
def copyout(self, dest:memoryview, src):
|
133
|
+
CUDADevice.synchronize_system()
|
72
134
|
check(cuda.cuCtxSetCurrent(self.device.context))
|
73
135
|
check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest)))
|
136
|
+
def transfer(self, dest, src, sz:int, src_dev, dest_dev):
|
137
|
+
check(cuda.cuCtxSetCurrent(src_dev.context))
|
138
|
+
check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
|
139
|
+
check(cuda.cuMemcpyDtoDAsync_v2(dest, src, sz, None))
|
140
|
+
check(cuda.cuEventRecord(sync_event, None))
|
141
|
+
check(cuda.cuCtxSetCurrent(dest_dev.context))
|
142
|
+
check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev
|
143
|
+
def offset(self, buf, size:int, offset:int): return ctypes.c_ulong(buf.value + offset)
|
74
144
|
|
75
145
|
class CUDADevice(Compiled):
|
76
|
-
|
146
|
+
devices: List[CUDADevice] = []
|
147
|
+
peer_access = False
|
148
|
+
|
77
149
|
def __init__(self, device:str):
|
78
150
|
device_id = int(device.split(":")[1]) if ":" in device else 0
|
79
151
|
if not CUDACPU:
|
80
152
|
check(cuda.cuInit(0))
|
81
|
-
check(cuda.cuDeviceGet(ctypes.byref(
|
82
|
-
self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0,
|
153
|
+
self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
|
154
|
+
self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
|
83
155
|
check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
|
84
|
-
|
156
|
+
|
157
|
+
for dev in CUDADevice.devices:
|
158
|
+
check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
|
159
|
+
if val.value != 1: continue
|
160
|
+
check(cuda.cuCtxSetCurrent(dev.context))
|
161
|
+
check(cuda.cuCtxEnablePeerAccess(self.context, 0))
|
162
|
+
check(cuda.cuCtxSetCurrent(self.context))
|
163
|
+
check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
|
164
|
+
CUDADevice.peer_access = True
|
165
|
+
|
166
|
+
self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
|
167
|
+
self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
|
168
|
+
CUDADevice.devices.append(self)
|
85
169
|
|
86
170
|
from tinygrad.runtime.graph.cuda import CUDAGraph
|
87
|
-
super().__init__(CUDAAllocator(self) if not CUDACPU else MallocAllocator,
|
88
|
-
|
89
|
-
|
171
|
+
super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
|
172
|
+
PTXRenderer(self.arch) if getenv("PTX") else CUDARenderer(self.arch),
|
173
|
+
PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
|
174
|
+
functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
|
175
|
+
|
90
176
|
def synchronize(self):
|
91
|
-
if
|
92
|
-
|
93
|
-
|
177
|
+
if CUDACPU: return
|
178
|
+
check(cuda.cuCtxSetCurrent(self.context))
|
179
|
+
check(cuda.cuCtxSynchronize())
|
180
|
+
for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
|
181
|
+
self.pending_copyin.clear()
|
182
|
+
|
183
|
+
@staticmethod
|
184
|
+
def synchronize_system():
|
185
|
+
for d in CUDADevice.devices: d.synchronize()
|
tinygrad/runtime/ops_disk.py
CHANGED
@@ -1,57 +1,125 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
from
|
4
|
-
from tinygrad.helpers import
|
5
|
-
from tinygrad.device import
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
def __del__(self):
|
12
|
-
if self.fd is not None: os.close(self.fd)
|
1
|
+
from __future__ import annotations
|
2
|
+
import os, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
|
3
|
+
from typing import Optional, Generator, Tuple, Callable, List
|
4
|
+
from tinygrad.helpers import OSX, round_up
|
5
|
+
from tinygrad.device import Compiled, Allocator
|
6
|
+
import tinygrad.runtime.autogen.io_uring as io_uring
|
7
|
+
|
8
|
+
libc = ctypes.CDLL(ctypes.util.find_library("c"))
|
9
|
+
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
|
10
|
+
libc.mmap.restype = ctypes.c_void_p
|
13
11
|
|
14
12
|
class DiskBuffer:
|
15
|
-
def __init__(self,
|
16
|
-
self.
|
17
|
-
def __repr__(self): return f"<DiskBuffer size={self.size}
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
return DiskBuffer(self.ud, self.size, arg[0], offset=self.offset)
|
22
|
-
def as_strided(self, arg):
|
23
|
-
assert strides_for_shape(arg[0]) == arg[1], "disk tensors don't support strides"
|
24
|
-
return DiskBuffer(self.ud, prod(arg[0]), self.dtype, offset=self.offset+arg[2]*self.dtype.itemsize)
|
25
|
-
def _buf(self) -> memoryview: return memoryview(self.ud.mem)[self.offset:self.offset+self.size*self.dtype.itemsize]
|
26
|
-
|
27
|
-
disk_fxn_for_op: Dict[Op, Callable] = { UnaryOps.CAST: DiskBuffer.cast, MovementOps.AS_STRIDED: DiskBuffer.as_strided }
|
13
|
+
def __init__(self, device:DiskDevice, size:int, offset=0):
|
14
|
+
self.device, self.size, self.offset = device, size, offset
|
15
|
+
def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
|
16
|
+
def _buf(self) -> memoryview:
|
17
|
+
assert self.device.mem is not None, "DiskBuffer wasn't opened"
|
18
|
+
return memoryview(self.device.mem)[self.offset:self.offset+self.size]
|
28
19
|
|
29
20
|
MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
|
30
21
|
class DiskAllocator(Allocator):
|
31
|
-
def __init__(self, device): self.device = device
|
32
|
-
def _alloc(self, size):
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
os.close(fd)
|
37
|
-
fd = None
|
38
|
-
else:
|
39
|
-
try: fd = os.open(self.device, os.O_RDWR|os.O_CREAT|(0 if OSX else os.O_DIRECT))
|
40
|
-
except OSError: fd = os.open(self.device, os.O_RDWR|os.O_CREAT)
|
41
|
-
if os.fstat(fd).st_size < size: os.ftruncate(fd, size)
|
42
|
-
mem = mmap.mmap(fd, size)
|
43
|
-
if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None: mem.madvise(hp) # type: ignore
|
44
|
-
return DiskBuffer(UnderlyingDiskBuffer(fd, mem), size)
|
22
|
+
def __init__(self, device:DiskDevice): self.device = device
|
23
|
+
def _alloc(self, size:int, options):
|
24
|
+
self.device._might_open(size)
|
25
|
+
return DiskBuffer(self.device, size)
|
26
|
+
def _free(self, opaque, options): self.device._might_close()
|
45
27
|
def as_buffer(self, src:DiskBuffer): return src._buf()
|
46
28
|
def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
|
47
29
|
def copyout(self, dest:memoryview, src:DiskBuffer):
|
48
|
-
if OSX and
|
30
|
+
if OSX and hasattr(self.device, 'fd'):
|
49
31
|
# OSX doesn't seem great at mmap, this is faster
|
50
|
-
with io.FileIO(
|
32
|
+
with io.FileIO(self.device.fd, "a+b", closefd=False) as fo:
|
51
33
|
fo.seek(src.offset)
|
52
34
|
fo.readinto(dest)
|
53
35
|
else:
|
54
36
|
dest[:] = src._buf()
|
55
37
|
|
56
|
-
|
57
|
-
|
38
|
+
def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[Tuple[int, int, int, int], None, None]:
|
39
|
+
assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
|
40
|
+
|
41
|
+
fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
|
42
|
+
processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
|
43
|
+
reqs: List[Tuple[int, int, int, int]] = []
|
44
|
+
|
45
|
+
while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
|
46
|
+
if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
|
47
|
+
# Prepare sqe
|
48
|
+
sqe_index = (tail:=DiskDevice.io_uring.sq.ktail[0]) & DiskDevice.io_uring.sq.kring_mask[0]
|
49
|
+
sqe = DiskDevice.io_uring.sq.sqes[sqe_index]
|
50
|
+
sqe.opcode, sqe.fd, sqe.off = io_uring.IORING_OP_READ, self.device.fd, fd_offset + next_read_offset
|
51
|
+
sqe.addr, sqe.len, sqe.user_data = copy_batch[0], min(seg_len, total_copy_size - next_read_offset), len(reqs)
|
52
|
+
|
53
|
+
# Send sqe
|
54
|
+
DiskDevice.io_uring.sq.array[sqe_index] = sqe_index
|
55
|
+
DiskDevice.io_uring.sq.ktail[0] = tail + 1
|
56
|
+
libc.syscall(io_uring.NR_io_uring_enter, DiskDevice.io_uring.ring_fd, 1, 1, io_uring.IORING_ENTER_GETEVENTS)
|
57
|
+
|
58
|
+
reqs.append((copy_batch, copied_in, minor_offset, real_copy_size:=min(sqe.len - minor_offset, size - copied_in)))
|
59
|
+
next_read_offset += sqe.len
|
60
|
+
copied_in += real_copy_size
|
61
|
+
minor_offset = 0
|
62
|
+
|
63
|
+
if (head:=DiskDevice.io_uring.cq.khead[0]) != DiskDevice.io_uring.cq.ktail[0]:
|
64
|
+
cqe = DiskDevice.io_uring.cq.cqes[head & DiskDevice.io_uring.cq.kring_mask[0]]
|
65
|
+
assert cqe.res >= 0, f"read from disk failed, err: {cqe.res}"
|
66
|
+
yield reqs[cqe.user_data]
|
67
|
+
DiskDevice.io_uring.cq.khead[0] = head + 1 # advance
|
68
|
+
processed_reqs_cnt += 1
|
69
|
+
|
70
|
+
def offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
|
71
|
+
|
72
|
+
class DiskDevice(Compiled):
|
73
|
+
_tried_io_uring_init = False
|
74
|
+
|
75
|
+
def __init__(self, device:str):
|
76
|
+
if not DiskDevice._tried_io_uring_init: self._iouring_setup()
|
77
|
+
|
78
|
+
self.size: Optional[int] = None
|
79
|
+
self.count = 0
|
80
|
+
super().__init__(device, DiskAllocator(self), None, None, None)
|
81
|
+
def _might_open(self, size):
|
82
|
+
self.count += 1
|
83
|
+
assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
|
84
|
+
if self.size is not None: return
|
85
|
+
filename = self.dname[len("disk:"):]
|
86
|
+
self.size = size
|
87
|
+
|
88
|
+
if filename.startswith("shm:"):
|
89
|
+
fd = _posixshmem.shm_open("/"+filename[4:].lstrip("/"), os.O_RDWR, 0o600)
|
90
|
+
self.mem = mmap.mmap(fd, self.size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)
|
91
|
+
os.close(fd)
|
92
|
+
else:
|
93
|
+
try: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT|(0 if OSX else os.O_DIRECT))
|
94
|
+
except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
|
95
|
+
if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
|
96
|
+
self.mem = mmap.mmap(self.fd, self.size)
|
97
|
+
if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
|
98
|
+
with contextlib.suppress(OSError): self.mem.madvise(hp) # some systems have transparent_hugepage disabled
|
99
|
+
def _might_close(self):
|
100
|
+
self.count -= 1
|
101
|
+
if self.count == 0:
|
102
|
+
if hasattr(self, 'fd'): os.close(self.fd)
|
103
|
+
self.size = None
|
104
|
+
def _iouring_setup(self):
|
105
|
+
DiskDevice._tried_io_uring_init = True
|
106
|
+
|
107
|
+
if platform.system() != 'Linux': return
|
108
|
+
|
109
|
+
fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
|
110
|
+
if fd < 0: return
|
111
|
+
|
112
|
+
sq_ptr = libc.mmap(0, p.sq_off.array + p.sq_entries * 4, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, 0)
|
113
|
+
cq_ptr = libc.mmap(0, p.cq_off.cqes + p.cq_entries * ctypes.sizeof(io_uring.struct_io_uring_cqe),
|
114
|
+
mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_CQ_RING)
|
115
|
+
sqes = libc.mmap(0, p.sq_entries * ctypes.sizeof(io_uring.struct_io_uring_sqe),
|
116
|
+
mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_SQES)
|
117
|
+
|
118
|
+
def u32ptr(val): return ctypes.cast(val, ctypes.POINTER(ctypes.c_uint32))
|
119
|
+
sqdesc = io_uring.struct_io_uring_sq(khead=u32ptr(sq_ptr+p.sq_off.head), ktail=u32ptr(sq_ptr+p.sq_off.tail), array=u32ptr(sq_ptr+p.sq_off.array),
|
120
|
+
kring_mask=u32ptr(sq_ptr+p.sq_off.ring_mask), sqes=ctypes.cast(sqes, ctypes.POINTER(io_uring.struct_io_uring_sqe)))
|
121
|
+
|
122
|
+
cqdesc = io_uring.struct_io_uring_cq(khead=u32ptr(cq_ptr+p.cq_off.head), ktail=u32ptr(cq_ptr+p.cq_off.tail),
|
123
|
+
kring_mask=u32ptr(sq_ptr+p.cq_off.ring_mask), cqes=ctypes.cast(cq_ptr+p.cq_off.cqes, ctypes.POINTER(io_uring.struct_io_uring_cqe)))
|
124
|
+
|
125
|
+
DiskDevice.io_uring = io_uring.struct_io_uring(ring_fd=fd, sq=sqdesc, cq=cqdesc) # type: ignore
|
tinygrad/runtime/ops_gpu.py
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
from typing import Tuple, Optional, List
|
3
|
-
import ctypes, functools
|
4
|
-
import
|
2
|
+
from typing import Tuple, Optional, List, cast
|
3
|
+
import ctypes, functools, hashlib
|
4
|
+
import tinygrad.runtime.autogen.opencl as cl
|
5
5
|
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
|
6
|
-
from tinygrad.dtype import ImageDType
|
7
|
-
from tinygrad.codegen.kernel import LinearizerOptions
|
8
6
|
from tinygrad.renderer.cstyle import OpenCLRenderer
|
9
|
-
from tinygrad.device import Compiled,
|
7
|
+
from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
|
10
8
|
|
11
9
|
# see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
|
12
10
|
OSX_TIMING_RATIO = (125/3) if OSX else 1.0
|
@@ -15,44 +13,47 @@ def check(status):
|
|
15
13
|
if status != 0: raise RuntimeError(f"OpenCL Error {status}")
|
16
14
|
def checked(ret, status): return (check(status.value), ret)[1]
|
17
15
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
cl.
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
16
|
+
class CLCompiler(Compiler):
|
17
|
+
def __init__(self, device:CLDevice, compile_key:str):
|
18
|
+
self.device = device
|
19
|
+
super().__init__(f"compile_cl_{compile_key}")
|
20
|
+
def compile(self, src:str) -> bytes:
|
21
|
+
program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([src.encode()]), None, status := ctypes.c_int32()), status)
|
22
|
+
build_status: int = cl.clBuildProgram(program, 1, self.device.device_id, None, cl.clBuildProgram.argtypes[4](), None)
|
23
|
+
if build_status != 0:
|
24
|
+
cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
|
25
|
+
cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
|
26
|
+
raise CompileError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
|
27
|
+
check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(ctypes.c_size_t), binary_sizes := (ctypes.c_size_t * 1)(), None))
|
28
|
+
check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), (ctypes.c_void_p * 1)(ctypes.addressof(binary := ctypes.create_string_buffer(binary_sizes[0]))), None)) # noqa: E501
|
29
|
+
check(cl.clReleaseProgram(program))
|
30
|
+
return bytes(binary)
|
31
31
|
|
32
32
|
class CLProgram:
|
33
33
|
def __init__(self, device:CLDevice, name:str, lib:bytes):
|
34
34
|
self.device, self.name, self.lib = device, name, lib
|
35
|
-
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1,
|
36
|
-
to_char_p_p([lib], ctypes.c_ubyte),
|
37
|
-
|
35
|
+
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
|
36
|
+
to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
|
37
|
+
errcode_ret := ctypes.c_int32()), errcode_ret)
|
38
38
|
check(binary_status.value)
|
39
|
-
check(cl.clBuildProgram(self.program, 1,
|
40
|
-
self.kernel = checked(cl.clCreateKernel(self.program, name.encode(),
|
39
|
+
check(cl.clBuildProgram(self.program, 1, device.device_id, None, cl.clBuildProgram.argtypes[4](), None)) # NOTE: OSX requires this
|
40
|
+
self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)
|
41
41
|
|
42
42
|
def __del__(self):
|
43
|
-
check(cl.clReleaseKernel(self.kernel))
|
44
|
-
check(cl.clReleaseProgram(self.program))
|
43
|
+
if hasattr(self, 'kernel'): check(cl.clReleaseKernel(self.kernel))
|
44
|
+
if hasattr(self, 'program'): check(cl.clReleaseProgram(self.program))
|
45
45
|
|
46
|
-
def __call__(self, *bufs:
|
46
|
+
def __call__(self, *bufs:ctypes._CData, global_size:Tuple[int,int,int]=(1,1,1), local_size:Optional[Tuple[int,int,int]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
|
47
47
|
for i,b in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
|
48
|
-
for i,
|
49
|
-
if local_size is not None: global_size = tuple(int(g*l) for g,l in zip(global_size, local_size))
|
48
|
+
for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
|
49
|
+
if local_size is not None: global_size = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
|
50
50
|
event = cl.cl_event() if wait else None
|
51
51
|
check(cl.clEnqueueNDRangeKernel(self.device.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
|
52
52
|
if wait:
|
53
|
-
|
54
|
-
|
55
|
-
|
53
|
+
assert event is not None
|
54
|
+
check(cl.clWaitForEvents(1, event))
|
55
|
+
check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_START, 8, ctypes.byref(start := ctypes.c_uint64()), None))
|
56
|
+
check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_END, 8, ctypes.byref(end := ctypes.c_uint64()), None))
|
56
57
|
return float(end.value-start.value) * OSX_TIMING_RATIO * 1e-9
|
57
58
|
return None
|
58
59
|
|
@@ -60,40 +61,41 @@ class CLAllocator(LRUAllocator):
|
|
60
61
|
def __init__(self, device:CLDevice):
|
61
62
|
self.device = device
|
62
63
|
super().__init__()
|
63
|
-
def _alloc(self, size:int) ->
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
def _free(self,
|
70
|
-
def copyin(self, dest:
|
64
|
+
def _alloc(self, size:int, options:BufferOptions) -> ctypes._CData:
|
65
|
+
if options.image is not None:
|
66
|
+
return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
|
67
|
+
cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
|
68
|
+
options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status)
|
69
|
+
return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status)
|
70
|
+
def _free(self, opaque:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(opaque))
|
71
|
+
def copyin(self, dest:ctypes._CData, src:memoryview):
|
71
72
|
check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
|
72
73
|
self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
|
73
|
-
def copyout(self, dest:memoryview, src:
|
74
|
+
def copyout(self, dest:memoryview, src:ctypes._CData):
|
74
75
|
check(cl.clEnqueueReadBuffer(self.device.queue, src, False, 0, len(dest)*dest.itemsize, from_mv(dest), 0, None, None))
|
75
76
|
self.device.synchronize()
|
76
77
|
|
77
78
|
class CLDevice(Compiled):
|
78
79
|
device_ids = None # this is global and only initted once
|
79
|
-
compiler_context = None # this is the first created context. we make an assumption they are all the same for the compiler
|
80
80
|
def __init__(self, device:str=""):
|
81
81
|
if CLDevice.device_ids is None:
|
82
|
-
|
83
|
-
platform_ids
|
82
|
+
check(cl.clGetPlatformIDs(0, None, num_platforms := ctypes.c_uint32()))
|
83
|
+
check(cl.clGetPlatformIDs(num_platforms.value, platform_ids := (cl.cl_platform_id * num_platforms.value)(), None))
|
84
84
|
for device_type in [cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_DEFAULT]:
|
85
|
-
|
86
|
-
err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, ctypes.byref(num_devices))
|
85
|
+
err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, num_devices := ctypes.c_uint32())
|
87
86
|
if err == 0 and num_devices.value != 0: break
|
88
87
|
if DEBUG >= 1: print(f"CLDevice: got {num_platforms.value} platforms and {num_devices.value} devices")
|
89
88
|
CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], device_type, num_devices, x, None))) # noqa: E501
|
90
89
|
|
91
90
|
self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
|
92
|
-
self.
|
93
|
-
|
94
|
-
self.
|
91
|
+
self.device_name = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_NAME, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
|
92
|
+
self.driver_version = (cl.clGetDeviceInfo(self.device_id, cl.CL_DRIVER_VERSION, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
|
93
|
+
self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
|
94
|
+
self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
|
95
95
|
self.pending_copyin: List[memoryview] = []
|
96
|
-
|
96
|
+
|
97
|
+
compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
|
98
|
+
super().__init__(device, CLAllocator(self), OpenCLRenderer(), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
|
97
99
|
def synchronize(self):
|
98
100
|
check(cl.clFinish(self.queue))
|
99
101
|
self.pending_copyin.clear()
|