tinygrad 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/__init__.py +0 -0
  3. tinygrad/codegen/kernel.py +253 -225
  4. tinygrad/codegen/linearizer.py +398 -436
  5. tinygrad/codegen/uops.py +451 -0
  6. tinygrad/device.py +268 -274
  7. tinygrad/dtype.py +56 -40
  8. tinygrad/engine/__init__.py +0 -0
  9. tinygrad/engine/graph.py +100 -0
  10. tinygrad/engine/jit.py +198 -0
  11. tinygrad/engine/realize.py +192 -0
  12. tinygrad/engine/schedule.py +370 -0
  13. tinygrad/engine/search.py +199 -0
  14. tinygrad/{mlops.py → function.py} +40 -32
  15. tinygrad/helpers.py +144 -46
  16. tinygrad/lazy.py +143 -242
  17. tinygrad/multi.py +173 -0
  18. tinygrad/nn/__init__.py +180 -9
  19. tinygrad/nn/datasets.py +8 -0
  20. tinygrad/nn/optim.py +106 -28
  21. tinygrad/nn/state.py +87 -19
  22. tinygrad/ops.py +104 -45
  23. tinygrad/renderer/__init__.py +65 -0
  24. tinygrad/renderer/assembly.py +269 -0
  25. tinygrad/renderer/cstyle.py +308 -210
  26. tinygrad/renderer/llvmir.py +119 -124
  27. tinygrad/runtime/__init__.py +0 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +13403 -0
  29. tinygrad/runtime/autogen/comgr.py +891 -0
  30. tinygrad/runtime/autogen/cuda.py +5923 -0
  31. tinygrad/runtime/autogen/hip.py +5909 -0
  32. tinygrad/runtime/autogen/hsa.py +5893 -0
  33. tinygrad/runtime/autogen/io_uring.py +1486 -0
  34. tinygrad/runtime/autogen/kfd.py +812 -0
  35. tinygrad/runtime/autogen/nv_gpu.py +33597 -0
  36. tinygrad/runtime/autogen/opencl.py +1795 -0
  37. tinygrad/runtime/driver/__init__.py +0 -0
  38. tinygrad/runtime/driver/hip_comgr.py +56 -0
  39. tinygrad/runtime/graph/__init__.py +0 -0
  40. tinygrad/runtime/graph/clang.py +39 -0
  41. tinygrad/runtime/graph/cuda.py +59 -54
  42. tinygrad/runtime/graph/hcq.py +187 -0
  43. tinygrad/runtime/graph/metal.py +37 -41
  44. tinygrad/runtime/ops_amd.py +550 -0
  45. tinygrad/runtime/ops_clang.py +16 -14
  46. tinygrad/runtime/ops_cuda.py +129 -37
  47. tinygrad/runtime/ops_disk.py +111 -43
  48. tinygrad/runtime/ops_gpu.py +52 -50
  49. tinygrad/runtime/ops_llvm.py +36 -56
  50. tinygrad/runtime/ops_metal.py +41 -24
  51. tinygrad/runtime/ops_npy.py +9 -0
  52. tinygrad/runtime/ops_nv.py +625 -0
  53. tinygrad/runtime/ops_python.py +208 -0
  54. tinygrad/shape/__init__.py +0 -0
  55. tinygrad/shape/shapetracker.py +46 -107
  56. tinygrad/shape/symbolic.py +99 -98
  57. tinygrad/shape/view.py +162 -45
  58. tinygrad/tensor.py +2492 -483
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
  60. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
  61. tinygrad-0.9.1.dist-info/RECORD +63 -0
  62. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  63. tinygrad/features/image.py +0 -93
  64. tinygrad/features/multi.py +0 -103
  65. tinygrad/features/search.py +0 -160
  66. tinygrad/graph.py +0 -106
  67. tinygrad/jit.py +0 -152
  68. tinygrad/realize.py +0 -50
  69. tinygrad/runtime/graph/hip.py +0 -24
  70. tinygrad/runtime/ops_cpu.py +0 -45
  71. tinygrad/runtime/ops_hip.py +0 -97
  72. tinygrad/runtime/ops_torch.py +0 -49
  73. tinygrad-0.8.0.dist-info/RECORD +0 -41
  74. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,13 @@
1
1
  from __future__ import annotations
2
2
  import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
3
3
  from pathlib import Path
4
- from typing import Tuple, Optional
5
- import gpuctypes.cuda as cuda
6
- from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, colored, cpu_time_execution, compile_cuda_style, encode_args_cuda_style, time_execution_cuda_style # noqa: E501
7
- from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
8
- from tinygrad.codegen.kernel import LinearizerOptions
4
+ from typing import Tuple, Optional, List
5
+ import tinygrad.runtime.autogen.cuda as cuda
6
+ from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
7
+ from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
9
8
  from tinygrad.renderer.cstyle import CUDARenderer
9
+ from tinygrad.renderer.assembly import PTXRenderer
10
+ if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
10
11
 
11
12
  def pretty_ptx(s):
12
13
  # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
@@ -22,72 +23,163 @@ CUDACPU = getenv("CUDACPU") == 1
22
23
  if CUDACPU:
23
24
  gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
24
25
  gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] # noqa: E501
25
- cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # noqa: E501
26
+ cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # type: ignore # noqa: E501
26
27
 
27
28
  def check(status):
28
29
  if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
29
30
 
30
- def cu_time_execution(cb, enable=False) -> Optional[float]: return time_execution_cuda_style(cb, cuda.CUevent, cuda.cuEventCreate, cuda.cuEventRecord, cuda.cuEventSynchronize, cuda.cuEventDestroy_v2, cuda.cuEventElapsedTime, enable=enable) if not CUDACPU else cpu_time_execution(cb, enable=enable) # noqa: E501
31
+ def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
32
+ c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] +
33
+ [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
34
+ vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2),
35
+ ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p), ctypes.c_void_p(0))
36
+ return c_args, vargs
31
37
 
32
- def compile_cuda(prg) -> bytes: return compile_cuda_style(prg, [f'--gpu-architecture={CUDADevice.default_arch_name}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"], cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check) # noqa: E501
38
+ def cu_time_execution(cb, enable=False) -> Optional[float]:
39
+ if CUDACPU: return cpu_time_execution(cb, enable=enable)
40
+ if not enable: return cb()
41
+ evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
42
+ cuda.cuEventRecord(evs[0], None)
43
+ cb()
44
+ cuda.cuEventRecord(evs[1], None)
45
+ check(cuda.cuEventSynchronize(evs[1]))
46
+ cuda.cuEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), evs[0], evs[1])
47
+ for ev in evs: cuda.cuEventDestroy_v2(ev)
48
+ return ret.value * 1e-3
49
+
50
+ def _get_bytes(arg, get_str, get_sz, check) -> bytes:
51
+ sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
52
+ return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
53
+
54
+ class PTXCompiler(Compiler):
55
+ def __init__(self, arch:str):
56
+ self.arch = arch
57
+ self.version = "7.8" if arch >= "sm_89" else "7.5"
58
+ super().__init__(f"compile_ptx_{self.arch}")
59
+ def compile(self, src:str) -> bytes: return src.replace("TARGET", self.arch).replace("VERSION", self.version).encode()
60
+
61
+ class CUDACompiler(Compiler):
62
+ def __init__(self, arch:str):
63
+ self.arch = arch
64
+ check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
65
+ self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
66
+ if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
67
+ super().__init__(f"compile_cuda_{self.arch}")
68
+ def compile(self, src:str) -> bytes:
69
+ check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
70
+ status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
71
+
72
+ if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
73
+ return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
74
+
75
+ def cuda_disassemble(lib, arch):
76
+ try:
77
+ fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
78
+ with open(fn + ".ptx", "wb") as f: f.write(lib)
79
+ subprocess.run(["ptxas", f"-arch={arch}", "-o", fn, fn+".ptx"], check=True)
80
+ print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
81
+ except Exception as e: print("failed to generate SASS", str(e))
33
82
 
34
83
  class CUDAProgram:
35
84
  def __init__(self, device:CUDADevice, name:str, lib:bytes):
36
85
  self.device, self.name, self.lib = device, name, lib
37
- if DEBUG >= 5: print(pretty_ptx(lib.decode('utf-8')))
38
- if DEBUG >= 6:
39
- try:
40
- fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
41
- with open(fn + ".ptx", "wb") as f: f.write(lib)
42
- subprocess.run(["ptxas", f"-arch={CUDADevice.default_arch_name}", "-o", fn, fn+".ptx"], check=True)
43
- print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
44
- except Exception as e: print("failed to generate SASS", str(e))
86
+ if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
87
+ if DEBUG >= 6: cuda_disassemble(lib, device.arch)
45
88
 
46
- if not CUDACPU:
89
+ if CUDACPU: self.prg = lib
90
+ else:
47
91
  check(cuda.cuCtxSetCurrent(self.device.context))
48
- self.module = init_c_var(cuda.CUmodule(), lambda x: check(cuda.cuModuleLoadData(ctypes.byref(x), lib)))
92
+ self.module = cuda.CUmodule()
93
+ status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
94
+ if status != 0:
95
+ del self.module
96
+ cuda_disassemble(lib, device.arch)
97
+ raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
49
98
  check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
50
- self.prg = prg if not CUDACPU else lib
99
+ self.prg = prg #type: ignore
51
100
 
52
101
  def __del__(self):
53
- if not CUDACPU: check(cuda.cuModuleUnload(self.module))
102
+ if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
54
103
 
55
- def __call__(self, *bufs, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], vals:Tuple[int, ...]=(), wait=False):
56
- if not CUDACPU: check(cuda.cuCtxSetCurrent(self.device.context))
57
- c_kernel_input_config = encode_args_cuda_style(bufs, vals, cuda.CUdeviceptr_v2, (1,2,0))[0] if not CUDACPU else (bufs+tuple(vals))
58
- return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, c_kernel_input_config)), enable=wait) # noqa: E501
104
+ def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
105
+ if CUDACPU: self.vargs = args+tuple(vals)
106
+ else:
107
+ check(cuda.cuCtxSetCurrent(self.device.context))
108
+ if not hasattr(self, "vargs"):
109
+ self.c_args, self.vargs = encode_args(args, vals) #type: ignore
110
+ else:
111
+ for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
112
+ for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
113
+ return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs)), enable=wait)
59
114
 
60
115
  class CUDAAllocator(LRUAllocator):
61
116
  def __init__(self, device:CUDADevice):
62
117
  self.device = device
63
118
  super().__init__()
64
- def _alloc(self, size):
119
+ def _alloc(self, size, options:BufferOptions):
65
120
  check(cuda.cuCtxSetCurrent(self.device.context))
121
+ if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
66
122
  return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
67
- def _free(self, opaque): check(cuda.cuMemFree_v2(opaque))
123
+ def _free(self, opaque, options:BufferOptions):
124
+ if options.host: check(cuda.cuMemFreeHost(opaque))
125
+ else: check(cuda.cuMemFree_v2(opaque))
68
126
  def copyin(self, dest, src:memoryview):
69
127
  check(cuda.cuCtxSetCurrent(self.device.context))
70
- check(cuda.cuMemcpyHtoD_v2(dest, from_mv(src), len(src), None))
128
+ host_mem = self.alloc(len(src), BufferOptions(host=True))
129
+ self.device.pending_copyin.append((host_mem, len(src), BufferOptions(host=True)))
130
+ ctypes.memmove(host_mem, from_mv(src), len(src))
131
+ check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
71
132
  def copyout(self, dest:memoryview, src):
133
+ CUDADevice.synchronize_system()
72
134
  check(cuda.cuCtxSetCurrent(self.device.context))
73
135
  check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest)))
136
+ def transfer(self, dest, src, sz:int, src_dev, dest_dev):
137
+ check(cuda.cuCtxSetCurrent(src_dev.context))
138
+ check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
139
+ check(cuda.cuMemcpyDtoDAsync_v2(dest, src, sz, None))
140
+ check(cuda.cuEventRecord(sync_event, None))
141
+ check(cuda.cuCtxSetCurrent(dest_dev.context))
142
+ check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev
143
+ def offset(self, buf, size:int, offset:int): return ctypes.c_ulong(buf.value + offset)
74
144
 
75
145
  class CUDADevice(Compiled):
76
- default_arch_name = "sm_35"
146
+ devices: List[CUDADevice] = []
147
+ peer_access = False
148
+
77
149
  def __init__(self, device:str):
78
150
  device_id = int(device.split(":")[1]) if ":" in device else 0
79
151
  if not CUDACPU:
80
152
  check(cuda.cuInit(0))
81
- check(cuda.cuDeviceGet(ctypes.byref(device := cuda.CUdevice()), device_id))
82
- self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, device)))
153
+ self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
154
+ self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
83
155
  check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
84
- if device_id == 0: CUDADevice.default_arch_name = f"sm_{major.value}{minor.value}"
156
+
157
+ for dev in CUDADevice.devices:
158
+ check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
159
+ if val.value != 1: continue
160
+ check(cuda.cuCtxSetCurrent(dev.context))
161
+ check(cuda.cuCtxEnablePeerAccess(self.context, 0))
162
+ check(cuda.cuCtxSetCurrent(self.context))
163
+ check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
164
+ CUDADevice.peer_access = True
165
+
166
+ self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
167
+ self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
168
+ CUDADevice.devices.append(self)
85
169
 
86
170
  from tinygrad.runtime.graph.cuda import CUDAGraph
87
- super().__init__(CUDAAllocator(self) if not CUDACPU else MallocAllocator,
88
- LinearizerOptions(supports_float4_alu=False, global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024]),
89
- CUDARenderer, compile_cuda, functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
171
+ super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
172
+ PTXRenderer(self.arch) if getenv("PTX") else CUDARenderer(self.arch),
173
+ PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
174
+ functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
175
+
90
176
  def synchronize(self):
91
- if not CUDACPU:
92
- check(cuda.cuCtxSetCurrent(self.context))
93
- check(cuda.cuCtxSynchronize())
177
+ if CUDACPU: return
178
+ check(cuda.cuCtxSetCurrent(self.context))
179
+ check(cuda.cuCtxSynchronize())
180
+ for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
181
+ self.pending_copyin.clear()
182
+
183
+ @staticmethod
184
+ def synchronize_system():
185
+ for d in CUDADevice.devices: d.synchronize()
@@ -1,57 +1,125 @@
1
- import os, mmap, _posixshmem, io
2
- from typing import Callable, Dict, Tuple
3
- from tinygrad.dtype import DType, dtypes
4
- from tinygrad.helpers import prod, OSX
5
- from tinygrad.device import Interpreted, Allocator
6
- from tinygrad.ops import Op, MovementOps, UnaryOps
7
- from tinygrad.shape.view import strides_for_shape
8
-
9
- class UnderlyingDiskBuffer:
10
- def __init__(self, fd, mem): self.fd, self.mem = fd, mem
11
- def __del__(self):
12
- if self.fd is not None: os.close(self.fd)
1
+ from __future__ import annotations
2
+ import os, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
3
+ from typing import Optional, Generator, Tuple, Callable, List
4
+ from tinygrad.helpers import OSX, round_up
5
+ from tinygrad.device import Compiled, Allocator
6
+ import tinygrad.runtime.autogen.io_uring as io_uring
7
+
8
+ libc = ctypes.CDLL(ctypes.util.find_library("c"))
9
+ libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
10
+ libc.mmap.restype = ctypes.c_void_p
13
11
 
14
12
  class DiskBuffer:
15
- def __init__(self, ud:UnderlyingDiskBuffer, size:int, dtype:DType=dtypes.uint8, offset=0):
16
- self.ud, self.size, self.dtype, self.offset = ud, size, dtype, offset
17
- def __repr__(self): return f"<DiskBuffer size={self.size} dtype={self.dtype} offset={self.offset}>"
18
- def cast(self, arg:Tuple[DType, bool]):
19
- # TODO: support shape changing bitcast
20
- #assert arg[1], "DiskTensor only supports bitcast"
21
- return DiskBuffer(self.ud, self.size, arg[0], offset=self.offset)
22
- def as_strided(self, arg):
23
- assert strides_for_shape(arg[0]) == arg[1], "disk tensors don't support strides"
24
- return DiskBuffer(self.ud, prod(arg[0]), self.dtype, offset=self.offset+arg[2]*self.dtype.itemsize)
25
- def _buf(self) -> memoryview: return memoryview(self.ud.mem)[self.offset:self.offset+self.size*self.dtype.itemsize]
26
-
27
- disk_fxn_for_op: Dict[Op, Callable] = { UnaryOps.CAST: DiskBuffer.cast, MovementOps.AS_STRIDED: DiskBuffer.as_strided }
13
+ def __init__(self, device:DiskDevice, size:int, offset=0):
14
+ self.device, self.size, self.offset = device, size, offset
15
+ def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
16
+ def _buf(self) -> memoryview:
17
+ assert self.device.mem is not None, "DiskBuffer wasn't opened"
18
+ return memoryview(self.device.mem)[self.offset:self.offset+self.size]
28
19
 
29
20
  MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
30
21
  class DiskAllocator(Allocator):
31
- def __init__(self, device): self.device = device
32
- def _alloc(self, size):
33
- if str(self.device).startswith("shm:"):
34
- fd = _posixshmem.shm_open("/"+self.device[4:].lstrip("/"), os.O_RDWR, 0o600)
35
- mem = mmap.mmap(fd, size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)
36
- os.close(fd)
37
- fd = None
38
- else:
39
- try: fd = os.open(self.device, os.O_RDWR|os.O_CREAT|(0 if OSX else os.O_DIRECT))
40
- except OSError: fd = os.open(self.device, os.O_RDWR|os.O_CREAT)
41
- if os.fstat(fd).st_size < size: os.ftruncate(fd, size)
42
- mem = mmap.mmap(fd, size)
43
- if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None: mem.madvise(hp) # type: ignore
44
- return DiskBuffer(UnderlyingDiskBuffer(fd, mem), size)
22
+ def __init__(self, device:DiskDevice): self.device = device
23
+ def _alloc(self, size:int, options):
24
+ self.device._might_open(size)
25
+ return DiskBuffer(self.device, size)
26
+ def _free(self, opaque, options): self.device._might_close()
45
27
  def as_buffer(self, src:DiskBuffer): return src._buf()
46
28
  def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
47
29
  def copyout(self, dest:memoryview, src:DiskBuffer):
48
- if OSX and src.ud.fd is not None:
30
+ if OSX and hasattr(self.device, 'fd'):
49
31
  # OSX doesn't seem great at mmap, this is faster
50
- with io.FileIO(src.ud.fd, "a+b", closefd=False) as fo:
32
+ with io.FileIO(self.device.fd, "a+b", closefd=False) as fo:
51
33
  fo.seek(src.offset)
52
34
  fo.readinto(dest)
53
35
  else:
54
36
  dest[:] = src._buf()
55
37
 
56
- class DiskDevice(Interpreted):
57
- def __init__(self, device): super().__init__(DiskAllocator(device[5:]), disk_fxn_for_op)
38
+ def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[Tuple[int, int, int, int], None, None]:
39
+ assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
40
+
41
+ fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
42
+ processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
43
+ reqs: List[Tuple[int, int, int, int]] = []
44
+
45
+ while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
46
+ if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
47
+ # Prepare sqe
48
+ sqe_index = (tail:=DiskDevice.io_uring.sq.ktail[0]) & DiskDevice.io_uring.sq.kring_mask[0]
49
+ sqe = DiskDevice.io_uring.sq.sqes[sqe_index]
50
+ sqe.opcode, sqe.fd, sqe.off = io_uring.IORING_OP_READ, self.device.fd, fd_offset + next_read_offset
51
+ sqe.addr, sqe.len, sqe.user_data = copy_batch[0], min(seg_len, total_copy_size - next_read_offset), len(reqs)
52
+
53
+ # Send sqe
54
+ DiskDevice.io_uring.sq.array[sqe_index] = sqe_index
55
+ DiskDevice.io_uring.sq.ktail[0] = tail + 1
56
+ libc.syscall(io_uring.NR_io_uring_enter, DiskDevice.io_uring.ring_fd, 1, 1, io_uring.IORING_ENTER_GETEVENTS)
57
+
58
+ reqs.append((copy_batch, copied_in, minor_offset, real_copy_size:=min(sqe.len - minor_offset, size - copied_in)))
59
+ next_read_offset += sqe.len
60
+ copied_in += real_copy_size
61
+ minor_offset = 0
62
+
63
+ if (head:=DiskDevice.io_uring.cq.khead[0]) != DiskDevice.io_uring.cq.ktail[0]:
64
+ cqe = DiskDevice.io_uring.cq.cqes[head & DiskDevice.io_uring.cq.kring_mask[0]]
65
+ assert cqe.res >= 0, f"read from disk failed, err: {cqe.res}"
66
+ yield reqs[cqe.user_data]
67
+ DiskDevice.io_uring.cq.khead[0] = head + 1 # advance
68
+ processed_reqs_cnt += 1
69
+
70
+ def offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
71
+
72
+ class DiskDevice(Compiled):
73
+ _tried_io_uring_init = False
74
+
75
+ def __init__(self, device:str):
76
+ if not DiskDevice._tried_io_uring_init: self._iouring_setup()
77
+
78
+ self.size: Optional[int] = None
79
+ self.count = 0
80
+ super().__init__(device, DiskAllocator(self), None, None, None)
81
+ def _might_open(self, size):
82
+ self.count += 1
83
+ assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
84
+ if self.size is not None: return
85
+ filename = self.dname[len("disk:"):]
86
+ self.size = size
87
+
88
+ if filename.startswith("shm:"):
89
+ fd = _posixshmem.shm_open("/"+filename[4:].lstrip("/"), os.O_RDWR, 0o600)
90
+ self.mem = mmap.mmap(fd, self.size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)
91
+ os.close(fd)
92
+ else:
93
+ try: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT|(0 if OSX else os.O_DIRECT))
94
+ except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
95
+ if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
96
+ self.mem = mmap.mmap(self.fd, self.size)
97
+ if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
98
+ with contextlib.suppress(OSError): self.mem.madvise(hp) # some systems have transparent_hugepage disabled
99
+ def _might_close(self):
100
+ self.count -= 1
101
+ if self.count == 0:
102
+ if hasattr(self, 'fd'): os.close(self.fd)
103
+ self.size = None
104
+ def _iouring_setup(self):
105
+ DiskDevice._tried_io_uring_init = True
106
+
107
+ if platform.system() != 'Linux': return
108
+
109
+ fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
110
+ if fd < 0: return
111
+
112
+ sq_ptr = libc.mmap(0, p.sq_off.array + p.sq_entries * 4, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, 0)
113
+ cq_ptr = libc.mmap(0, p.cq_off.cqes + p.cq_entries * ctypes.sizeof(io_uring.struct_io_uring_cqe),
114
+ mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_CQ_RING)
115
+ sqes = libc.mmap(0, p.sq_entries * ctypes.sizeof(io_uring.struct_io_uring_sqe),
116
+ mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_SQES)
117
+
118
+ def u32ptr(val): return ctypes.cast(val, ctypes.POINTER(ctypes.c_uint32))
119
+ sqdesc = io_uring.struct_io_uring_sq(khead=u32ptr(sq_ptr+p.sq_off.head), ktail=u32ptr(sq_ptr+p.sq_off.tail), array=u32ptr(sq_ptr+p.sq_off.array),
120
+ kring_mask=u32ptr(sq_ptr+p.sq_off.ring_mask), sqes=ctypes.cast(sqes, ctypes.POINTER(io_uring.struct_io_uring_sqe)))
121
+
122
+ cqdesc = io_uring.struct_io_uring_cq(khead=u32ptr(cq_ptr+p.cq_off.head), ktail=u32ptr(cq_ptr+p.cq_off.tail),
123
+ kring_mask=u32ptr(sq_ptr+p.cq_off.ring_mask), cqes=ctypes.cast(cq_ptr+p.cq_off.cqes, ctypes.POINTER(io_uring.struct_io_uring_cqe)))
124
+
125
+ DiskDevice.io_uring = io_uring.struct_io_uring(ring_fd=fd, sq=sqdesc, cq=cqdesc) # type: ignore
@@ -1,12 +1,10 @@
1
1
  from __future__ import annotations
2
- from typing import Tuple, Optional, List
3
- import ctypes, functools
4
- import gpuctypes.opencl as cl
2
+ from typing import Tuple, Optional, List, cast
3
+ import ctypes, functools, hashlib
4
+ import tinygrad.runtime.autogen.opencl as cl
5
5
  from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
6
- from tinygrad.dtype import ImageDType
7
- from tinygrad.codegen.kernel import LinearizerOptions
8
6
  from tinygrad.renderer.cstyle import OpenCLRenderer
9
- from tinygrad.device import Compiled, LRUAllocator
7
+ from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
10
8
 
11
9
  # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
12
10
  OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,44 +13,47 @@ def check(status):
15
13
  if status != 0: raise RuntimeError(f"OpenCL Error {status}")
16
14
  def checked(ret, status): return (check(status.value), ret)[1]
17
15
 
18
- def compile_cl(prg:str) -> bytes:
19
- assert CLDevice.compiler_context is not None, 'OpenCL requires a "compiler_context" to compile, init a device before you call this'
20
- program = checked(cl.clCreateProgramWithSource(CLDevice.compiler_context.context, 1, to_char_p_p([prg_bytes := prg.encode()]),
21
- ctypes.byref(ctypes.c_size_t(len(prg_bytes))), ctypes.byref(status := ctypes.c_int32())), status)
22
- status = cl.clBuildProgram(program, 1, ctypes.byref(CLDevice.compiler_context.device_id), None, cl.clBuildProgram.argtypes[4](), None)
23
- if status != 0:
24
- cl.clGetProgramBuildInfo(program, CLDevice.compiler_context.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, ctypes.byref(log_size := ctypes.c_size_t())) # noqa: E501
25
- cl.clGetProgramBuildInfo(program, CLDevice.compiler_context.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
26
- raise RuntimeError(f"OpenCL Compile Error\n\n{ctypes.string_at(mstr, size=log_size.value).decode()}")
27
- binary_sizes = init_c_var((ctypes.c_size_t * 1)(), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(x), ctypes.byref(x), None))) # noqa: E501
28
- binary = init_c_var(ctypes.create_string_buffer(binary_sizes[0]), lambda x: check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), ctypes.byref((ctypes.c_void_p * 1)(ctypes.addressof(x))), None))) # noqa: E501
29
- check(cl.clReleaseProgram(program))
30
- return bytes(binary)
16
+ class CLCompiler(Compiler):
17
+ def __init__(self, device:CLDevice, compile_key:str):
18
+ self.device = device
19
+ super().__init__(f"compile_cl_{compile_key}")
20
+ def compile(self, src:str) -> bytes:
21
+ program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([src.encode()]), None, status := ctypes.c_int32()), status)
22
+ build_status: int = cl.clBuildProgram(program, 1, self.device.device_id, None, cl.clBuildProgram.argtypes[4](), None)
23
+ if build_status != 0:
24
+ cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
25
+ cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
26
+ raise CompileError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
27
+ check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(ctypes.c_size_t), binary_sizes := (ctypes.c_size_t * 1)(), None))
28
+ check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), (ctypes.c_void_p * 1)(ctypes.addressof(binary := ctypes.create_string_buffer(binary_sizes[0]))), None)) # noqa: E501
29
+ check(cl.clReleaseProgram(program))
30
+ return bytes(binary)
31
31
 
32
32
  class CLProgram:
33
33
  def __init__(self, device:CLDevice, name:str, lib:bytes):
34
34
  self.device, self.name, self.lib = device, name, lib
35
- self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, ctypes.byref(device.device_id), (ctypes.c_size_t * 1)(len(lib)),
36
- to_char_p_p([lib], ctypes.c_ubyte), ctypes.byref(binary_status := ctypes.c_int32()),
37
- ctypes.byref(errcode_ret := ctypes.c_int32())), errcode_ret)
35
+ self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
36
+ to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
37
+ errcode_ret := ctypes.c_int32()), errcode_ret)
38
38
  check(binary_status.value)
39
- check(cl.clBuildProgram(self.program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None)) # NOTE: OSX requires this
40
- self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), ctypes.byref(status := ctypes.c_int32())), status)
39
+ check(cl.clBuildProgram(self.program, 1, device.device_id, None, cl.clBuildProgram.argtypes[4](), None)) # NOTE: OSX requires this
40
+ self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)
41
41
 
42
42
  def __del__(self):
43
- check(cl.clReleaseKernel(self.kernel))
44
- check(cl.clReleaseProgram(self.program))
43
+ if hasattr(self, 'kernel'): check(cl.clReleaseKernel(self.kernel))
44
+ if hasattr(self, 'program'): check(cl.clReleaseProgram(self.program))
45
45
 
46
- def __call__(self, *bufs:cl.cl_mem, global_size:Tuple[int,...], local_size:Optional[Tuple[int,...]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
46
+ def __call__(self, *bufs:ctypes._CData, global_size:Tuple[int,int,int]=(1,1,1), local_size:Optional[Tuple[int,int,int]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
47
47
  for i,b in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
48
- for i,b in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(b)))
49
- if local_size is not None: global_size = tuple(int(g*l) for g,l in zip(global_size, local_size))
48
+ for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
49
+ if local_size is not None: global_size = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
50
50
  event = cl.cl_event() if wait else None
51
51
  check(cl.clEnqueueNDRangeKernel(self.device.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
52
52
  if wait:
53
- check(cl.clWaitForEvents(1, ctypes.byref(event)))
54
- start = init_c_var(ctypes.c_uint64(), lambda x: check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_START, ctypes.sizeof(x), ctypes.byref(x), None))) # noqa: E501
55
- end = init_c_var(ctypes.c_uint64(), lambda x: check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_END, ctypes.sizeof(x), ctypes.byref(x), None))) # noqa: E501
53
+ assert event is not None
54
+ check(cl.clWaitForEvents(1, event))
55
+ check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_START, 8, ctypes.byref(start := ctypes.c_uint64()), None))
56
+ check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_END, 8, ctypes.byref(end := ctypes.c_uint64()), None))
56
57
  return float(end.value-start.value) * OSX_TIMING_RATIO * 1e-9
57
58
  return None
58
59
 
@@ -60,40 +61,41 @@ class CLAllocator(LRUAllocator):
60
61
  def __init__(self, device:CLDevice):
61
62
  self.device = device
62
63
  super().__init__()
63
- def _alloc(self, size:int) -> cl.cl_mem:
64
- return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, ctypes.byref(status := ctypes.c_int32())), status)
65
- def _alloc_image(self, dtype:ImageDType) -> cl.cl_mem:
66
- return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
67
- cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[dtype.itemsize]),
68
- dtype.shape[1], dtype.shape[0], 0, None, ctypes.byref(status := ctypes.c_int32())), status)
69
- def _free(self, buf:cl.cl_mem): check(cl.clReleaseMemObject(buf))
70
- def copyin(self, dest:cl.cl_mem, src:memoryview):
64
+ def _alloc(self, size:int, options:BufferOptions) -> ctypes._CData:
65
+ if options.image is not None:
66
+ return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
67
+ cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
68
+ options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status)
69
+ return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status)
70
+ def _free(self, opaque:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(opaque))
71
+ def copyin(self, dest:ctypes._CData, src:memoryview):
71
72
  check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
72
73
  self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
73
- def copyout(self, dest:memoryview, src:cl.cl_mem):
74
+ def copyout(self, dest:memoryview, src:ctypes._CData):
74
75
  check(cl.clEnqueueReadBuffer(self.device.queue, src, False, 0, len(dest)*dest.itemsize, from_mv(dest), 0, None, None))
75
76
  self.device.synchronize()
76
77
 
77
78
  class CLDevice(Compiled):
78
79
  device_ids = None # this is global and only initted once
79
- compiler_context = None # this is the first created context. we make an assumption they are all the same for the compiler
80
80
  def __init__(self, device:str=""):
81
81
  if CLDevice.device_ids is None:
82
- num_platforms = init_c_var(ctypes.c_uint32(), lambda x: check(cl.clGetPlatformIDs(0, None, ctypes.byref(x))))
83
- platform_ids = init_c_var((cl.cl_platform_id * num_platforms.value)(), lambda x: check(cl.clGetPlatformIDs(num_platforms.value, x, None)))
82
+ check(cl.clGetPlatformIDs(0, None, num_platforms := ctypes.c_uint32()))
83
+ check(cl.clGetPlatformIDs(num_platforms.value, platform_ids := (cl.cl_platform_id * num_platforms.value)(), None))
84
84
  for device_type in [cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_DEFAULT]:
85
- num_devices = ctypes.c_uint32()
86
- err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, ctypes.byref(num_devices))
85
+ err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, num_devices := ctypes.c_uint32())
87
86
  if err == 0 and num_devices.value != 0: break
88
87
  if DEBUG >= 1: print(f"CLDevice: got {num_platforms.value} platforms and {num_devices.value} devices")
89
88
  CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], device_type, num_devices, x, None))) # noqa: E501
90
89
 
91
90
  self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
92
- self.context = checked(cl.clCreateContext(None, 1, ctypes.byref(self.device_id), cl.clCreateContext.argtypes[3](), None, ctypes.byref(status := ctypes.c_int32())), status) # noqa: E501
93
- if CLDevice.compiler_context is None: CLDevice.compiler_context = self
94
- self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, ctypes.byref(status)), status)
91
+ self.device_name = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_NAME, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
92
+ self.driver_version = (cl.clGetDeviceInfo(self.device_id, cl.CL_DRIVER_VERSION, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
93
+ self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
94
+ self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
95
95
  self.pending_copyin: List[memoryview] = []
96
- super().__init__(CLAllocator(self), LinearizerOptions(), OpenCLRenderer, compile_cl, functools.partial(CLProgram, self))
96
+
97
+ compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
98
+ super().__init__(device, CLAllocator(self), OpenCLRenderer(), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
97
99
  def synchronize(self):
98
100
  check(cl.clFinish(self.queue))
99
101
  self.pending_copyin.clear()