tinygrad 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/kernel.py +248 -115
- tinygrad/codegen/lowerer.py +215 -0
- tinygrad/codegen/transcendental.py +310 -0
- tinygrad/codegen/uopgraph.py +622 -0
- tinygrad/codegen/uops.py +235 -393
- tinygrad/device.py +428 -69
- tinygrad/dtype.py +18 -4
- tinygrad/engine/graph.py +19 -32
- tinygrad/engine/jit.py +148 -70
- tinygrad/engine/realize.py +127 -51
- tinygrad/engine/schedule.py +259 -216
- tinygrad/engine/search.py +29 -22
- tinygrad/function.py +9 -0
- tinygrad/helpers.py +87 -49
- tinygrad/lazy.py +34 -35
- tinygrad/multi.py +41 -36
- tinygrad/nn/__init__.py +39 -22
- tinygrad/nn/state.py +3 -3
- tinygrad/ops.py +63 -62
- tinygrad/renderer/__init__.py +43 -21
- tinygrad/renderer/assembly.py +104 -106
- tinygrad/renderer/cstyle.py +87 -60
- tinygrad/renderer/llvmir.py +21 -30
- tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
- tinygrad/runtime/autogen/cuda.py +6 -162
- tinygrad/runtime/autogen/kfd.py +32 -0
- tinygrad/runtime/autogen/libc.py +4260 -0
- tinygrad/runtime/autogen/nvrtc.py +579 -0
- tinygrad/runtime/graph/clang.py +2 -2
- tinygrad/runtime/graph/cuda.py +8 -11
- tinygrad/runtime/graph/hcq.py +120 -107
- tinygrad/runtime/graph/metal.py +18 -15
- tinygrad/runtime/ops_amd.py +197 -305
- tinygrad/runtime/ops_clang.py +2 -2
- tinygrad/runtime/ops_cuda.py +36 -94
- tinygrad/runtime/ops_disk.py +3 -7
- tinygrad/runtime/ops_gpu.py +4 -2
- tinygrad/runtime/ops_hip.py +70 -0
- tinygrad/runtime/ops_metal.py +38 -27
- tinygrad/runtime/ops_nv.py +283 -363
- tinygrad/runtime/ops_python.py +26 -30
- tinygrad/runtime/support/compiler_cuda.py +78 -0
- tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
- tinygrad/runtime/support/elf.py +38 -0
- tinygrad/shape/shapetracker.py +5 -14
- tinygrad/shape/symbolic.py +4 -8
- tinygrad/shape/view.py +34 -22
- tinygrad/tensor.py +399 -97
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
- tinygrad-0.9.2.dist-info/RECORD +70 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/linearizer.py +0 -528
- tinygrad-0.9.1.dist-info/RECORD +0 -63
- /tinygrad/runtime/{driver → support}/__init__.py +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_clang.py
CHANGED
@@ -7,8 +7,8 @@ class ClangCompiler(Compiler):
  def compile(self, src:str) -> bytes:
    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
    with tempfile.NamedTemporaryFile(delete=True) as output_file:
-       subprocess.check_output(['clang', '-
-                                '-o', str(output_file.name)], input=src.encode('utf-8'))
+       subprocess.check_output(['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
+                                '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
      return pathlib.Path(output_file.name).read_bytes()

class ClangProgram:
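
A minimal sketch of the new clang invocation above, assuming clang is on PATH; the trivial `add` kernel and the ctypes loading step are illustrative, not part of the package:

import ctypes, pathlib, subprocess, tempfile

src = "int add(int a, int b) { return a + b; }"  # illustrative kernel source
with tempfile.NamedTemporaryFile(delete=True) as output_file:
  # same flags ClangCompiler.compile now passes: a freestanding, no-libc, position-independent shared object
  subprocess.check_output(['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
                           '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
  fxn = ctypes.CDLL(str(output_file.name))  # ClangProgram loads the compiled object in a similar way
  print(fxn.add(2, 3))  # -> 5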
tinygrad/runtime/ops_cuda.py
CHANGED
@@ -1,30 +1,14 @@
from __future__ import annotations
- import
- from pathlib import Path
+ import ctypes, ctypes.util, functools
from typing import Tuple, Optional, List
-
- from tinygrad.
- from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
+ from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
+ from tinygrad.device import Compiled, BufferOptions, LRUAllocator
from tinygrad.renderer.cstyle import CUDARenderer
from tinygrad.renderer.assembly import PTXRenderer
+ from tinygrad.runtime.autogen import cuda
+ from tinygrad.runtime.support.compiler_cuda import cuda_disassemble, pretty_ptx, CUDACompiler, PTXCompiler, PTX
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import

- def pretty_ptx(s):
-   # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
-   s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers # noqa: E501
-   s = re.sub(r'(.)((?:b|s|u|f)(?:8|16|32|64)|pred)([\.\s])', lambda m:m[1]+colored(m[2], "green")+m[3], s, flags=re.M) # types
-   s = re.sub(r'^(\s*)([\w]+)(.*?;$)', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # instructions
-   s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers # noqa: E501
-   s = re.sub(r'(\.)(param|reg|global)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # space
-   s = re.sub(r'(\.)(version|target|address_size|visible|entry)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # derivatives
-   return s
-
- CUDACPU = getenv("CUDACPU") == 1
- if CUDACPU:
-   gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
-   gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] # noqa: E501
-   cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # type: ignore # noqa: E501
-
def check(status):
  if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501

@@ -36,7 +20,6 @@ def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
  return c_args, vargs

def cu_time_execution(cb, enable=False) -> Optional[float]:
-   if CUDACPU: return cpu_time_execution(cb, enable=enable)
  if not enable: return cb()
  evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
  cuda.cuEventRecord(evs[0], None)
@@ -47,69 +30,32 @@ def cu_time_execution(cb, enable=False) -> Optional[float]:
  for ev in evs: cuda.cuEventDestroy_v2(ev)
  return ret.value * 1e-3

- def _get_bytes(arg, get_str, get_sz, check) -> bytes:
-   sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
-   return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
-
- class PTXCompiler(Compiler):
-   def __init__(self, arch:str):
-     self.arch = arch
-     self.version = "7.8" if arch >= "sm_89" else "7.5"
-     super().__init__(f"compile_ptx_{self.arch}")
-   def compile(self, src:str) -> bytes: return src.replace("TARGET", self.arch).replace("VERSION", self.version).encode()
-
- class CUDACompiler(Compiler):
-   def __init__(self, arch:str):
-     self.arch = arch
-     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
-     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
-     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
-     super().__init__(f"compile_cuda_{self.arch}")
-   def compile(self, src:str) -> bytes:
-     check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
-     status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
-
-     if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
-     return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
-
- def cuda_disassemble(lib, arch):
-   try:
-     fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
-     with open(fn + ".ptx", "wb") as f: f.write(lib)
-     subprocess.run(["ptxas", f"-arch={arch}", "-o", fn, fn+".ptx"], check=True)
-     print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
-   except Exception as e: print("failed to generate SASS", str(e))
-
class CUDAProgram:
  def __init__(self, device:CUDADevice, name:str, lib:bytes):
    self.device, self.name, self.lib = device, name, lib
    if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
    if DEBUG >= 6: cuda_disassemble(lib, device.arch)

-
-
-
-
-
-
-
-
-
-     check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
-     self.prg = prg #type: ignore
+     check(cuda.cuCtxSetCurrent(self.device.context))
+     self.module = cuda.CUmodule()
+     status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
+     if status != 0:
+       del self.module
+       cuda_disassemble(lib, device.arch)
+       raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
+     check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
+     self.prg = prg #type: ignore

  def __del__(self):
    if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-
+     check(cuda.cuCtxSetCurrent(self.device.context))
+     if not hasattr(self, "vargs"):
+       self.c_args, self.vargs = encode_args(args, vals) #type: ignore
    else:
-
-
-       self.c_args, self.vargs = encode_args(args, vals) #type: ignore
-       else:
-         for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
-         for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+       for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+       for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
    return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs)), enable=wait)

class CUDAAllocator(LRUAllocator):
@@ -148,33 +94,29 @@ class CUDADevice(Compiled):

  def __init__(self, device:str):
    device_id = int(device.split(":")[1]) if ":" in device else 0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-     self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
+     check(cuda.cuInit(0))
+     self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
+     self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
+     check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
+
+     for dev in CUDADevice.devices:
+       check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
+       if val.value != 1: continue
+       check(cuda.cuCtxSetCurrent(dev.context))
+       check(cuda.cuCtxEnablePeerAccess(self.context, 0))
+       check(cuda.cuCtxSetCurrent(self.context))
+       check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
+       CUDADevice.peer_access = True
+
+     self.arch = f"sm_{major.value}{minor.value}"
    self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
    CUDADevice.devices.append(self)

    from tinygrad.runtime.graph.cuda import CUDAGraph
-     super().__init__(device, CUDAAllocator(self) if
-
-                      PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
-                      functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
+     super().__init__(device, CUDAAllocator(self), PTXRenderer(self.arch) if PTX else CUDARenderer(self.arch),
+                      PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self), graph=CUDAGraph)

  def synchronize(self):
-     if CUDACPU: return
    check(cuda.cuCtxSetCurrent(self.context))
    check(cuda.cuCtxSynchronize())
    for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
tinygrad/runtime/ops_disk.py
CHANGED
@@ -1,13 +1,9 @@
from __future__ import annotations
- import os, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
+ import os, sys, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
from typing import Optional, Generator, Tuple, Callable, List
from tinygrad.helpers import OSX, round_up
from tinygrad.device import Compiled, Allocator
-
-
- libc = ctypes.CDLL(ctypes.util.find_library("c"))
- libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
- libc.mmap.restype = ctypes.c_void_p
+ from tinygrad.runtime.autogen import io_uring, libc

class DiskBuffer:
  def __init__(self, device:DiskDevice, size:int, offset=0):
@@ -104,7 +100,7 @@ class DiskDevice(Compiled):
  def _iouring_setup(self):
    DiskDevice._tried_io_uring_init = True

-     if platform.system() != 'Linux': return
+     if platform.system() != 'Linux' or hasattr(sys, "getandroidapilevel"): return

    fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
    if fd < 0: return
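
A small sketch of the new io_uring gate above: Android also reports platform.system() == 'Linux', so the CPython-only sys.getandroidapilevel marker is used to skip io_uring there. The helper name below is illustrative, not part of the package:

import platform, sys

def io_uring_supported() -> bool:  # mirrors the check added to DiskDevice._iouring_setup
  return platform.system() == 'Linux' and not hasattr(sys, "getandroidapilevel")

print(io_uring_supported())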
tinygrad/runtime/ops_gpu.py
CHANGED
@@ -1,7 +1,7 @@
from __future__ import annotations
from typing import Tuple, Optional, List, cast
import ctypes, functools, hashlib
-
+ from tinygrad.runtime.autogen import opencl as cl
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
from tinygrad.renderer.cstyle import OpenCLRenderer
from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
@@ -9,8 +9,9 @@ from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, Com
# see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
OSX_TIMING_RATIO = (125/3) if OSX else 1.0

+ cl_errors = {attr: k for k in dir(cl) if k.startswith("CL_") and (attr:=getattr(cl, k)) <= 0}
def check(status):
-   if status != 0: raise RuntimeError(f"OpenCL Error {status}")
+   if status != 0: raise RuntimeError(f"OpenCL Error {status}: {cl_errors.get(status, 'Unknown error')}")
def checked(ret, status): return (check(status.value), ret)[1]

class CLCompiler(Compiler):
@@ -90,6 +91,7 @@ class CLDevice(Compiled):
    self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
    self.device_name = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_NAME, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
    self.driver_version = (cl.clGetDeviceInfo(self.device_id, cl.CL_DRIVER_VERSION, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
+     if DEBUG >= 1: print(f"CLDevice: opening {self.device_name} with version {self.driver_version}")
    self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
    self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
    self.pending_copyin: List[memoryview] = []
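
A sketch of the new OpenCL error decoding above: cl_errors maps the non-positive CL_* constants from the autogenerated bindings back to their symbolic names, so a failing call now reports both code and name. The -4 code used below is CL_MEM_OBJECT_ALLOCATION_FAILURE in the OpenCL headers:

from tinygrad.runtime.autogen import opencl as cl

cl_errors = {attr: k for k in dir(cl) if k.startswith("CL_") and (attr := getattr(cl, k)) <= 0}
print(cl_errors.get(-4, 'Unknown error'))  # expected: CL_MEM_OBJECT_ALLOCATION_FAILURE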
tinygrad/runtime/ops_hip.py
ADDED
@@ -0,0 +1,70 @@
+ from __future__ import annotations
+ import ctypes, functools
+ from typing import Tuple
+ from tinygrad.helpers import DEBUG, init_c_var, from_mv, init_c_struct_t
+ from tinygrad.device import Compiled, LRUAllocator, BufferOptions
+ from tinygrad.runtime.autogen import hip
+ from tinygrad.runtime.support.compiler_hip import AMDCompiler, disasm
+ from tinygrad.renderer.cstyle import HIPRenderer
+
+ def check(status):
+   if status != 0: raise RuntimeError(f"HIP Error {status}, {ctypes.string_at(hip.hipGetErrorString(status)).decode()}")
+
+ class HIPProgram:
+   def __init__(self, device:HIPDevice, name:str, lib:bytes):
+     self.device, self.name, self.lib = device, name, lib
+
+     if DEBUG >= 6: print(disasm(lib))
+
+     check(hip.hipSetDevice(self.device.device_id))
+     self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
+     self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))
+
+   def __del__(self):
+     if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
+
+   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+     check(hip.hipSetDevice(self.device.device_id))
+     if not hasattr(self, "vargs"):
+       self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
+                                           [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
+       self.vargs = (ctypes.c_void_p * 5)(1, ctypes.cast(ctypes.byref(self.c_args), ctypes.c_void_p), 2,
+                                          ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(self.c_args))), ctypes.c_void_p), 3)
+     else:
+       for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+       for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+
+     if wait: check(hip.hipEventRecord(self.device.time_event_st, None))
+
+     check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs))
+
+     if wait:
+       check(hip.hipEventRecord(self.device.time_event_en, None))
+       check(hip.hipEventSynchronize(self.device.time_event_en))
+       check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.device.time_event_st, self.device.time_event_en))
+       return ret.value * 1e-3
+
+ class HIPAllocator(LRUAllocator):
+   def __init__(self, device:HIPDevice):
+     self.device = device
+     super().__init__()
+   def _alloc(self, size:int, options:BufferOptions):
+     check(hip.hipSetDevice(self.device.device_id))
+     return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
+   def _free(self, opaque, options:BufferOptions): check(hip.hipFree(opaque))
+   def copyin(self, dest, src: memoryview):
+     check(hip.hipSetDevice(self.device.device_id))
+     check(hip.hipMemcpy(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice))
+   def copyout(self, dest:memoryview, src):
+     self.device.synchronize()
+     check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
+
+ class HIPDevice(Compiled):
+   def __init__(self, device:str=""):
+     self.device_id = int(device.split(":")[1]) if ":" in device else 0
+     self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
+     self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
+     super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
+   def synchronize(self):
+     check(hip.hipSetDevice(self.device_id))
+     check(hip.hipDeviceSynchronize())
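
ops_hip.py is a new file in 0.9.2, giving tinygrad a standalone HIP backend alongside ops_amd.py. A minimal usage sketch, assuming an AMD GPU with the HIP runtime installed and that the device is addressable as "HIP" (tinygrad derives the name from the ops_hip.py module name):

from tinygrad import Tensor

a = Tensor([1.0, 2.0], device="HIP")
b = Tensor([3.0, 4.0], device="HIP")
print((a + b).numpy())  # expected: [4. 6.]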
tinygrad/runtime/ops_metal.py
CHANGED
@@ -1,7 +1,7 @@
from __future__ import annotations
import os, subprocess, pathlib, ctypes, tempfile, functools
import Metal, libdispatch
- from typing import List,
+ from typing import List, Any, Tuple, Optional
from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator
from tinygrad.renderer.cstyle import MetalRenderer
@@ -33,7 +33,9 @@ class MetalProgram:
    with tempfile.NamedTemporaryFile(delete=True) as shader:
      shader.write(lib)
      shader.flush()
-       os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+       ret = os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+       if ret:
+         print("Error running disassembler: Make sure you have https://github.com/dougallj/applegpu cloned to tinygrad/extra/disassemblers/applegpu")
    assert lib[:4] == b"MTLB", "Invalid Metal library. Could be due to using conda. Try system python or METAL_XCODE=1 DISABLE_COMPILER_CACHE=1."
    data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
    self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
@@ -45,7 +47,7 @@ class MetalProgram:
    command_buffer = self.device.mtl_queue.commandBuffer()
    encoder = command_buffer.computeCommandEncoder()
    encoder.setComputePipelineState_(self.pipeline_state)
-     for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a,
+     for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a.buf, a.offset, i)
    for i,a in enumerate(vals,start=len(bufs)): encoder.setBytes_length_atIndex_(ctypes.c_int32(a), 4, i)
    encoder.dispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
    encoder.endEncoding()
@@ -55,46 +57,56 @@ class MetalProgram:
      return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
    self.device.mtl_buffers_in_flight.append(command_buffer)

+ class MetalBuffer:
+   def __init__(self, buf:Any, size:int, offset=0): self.buf, self.size, self.offset = buf, size, offset
+
class MetalAllocator(LRUAllocator):
  def __init__(self, device:MetalDevice):
    self.device:MetalDevice = device
-     self.track_cross_device: Set[MetalDevice] = set()
    super().__init__()
-   def
-     self.device.synchronize()
-     for x in self.track_cross_device: x.synchronize()
-     self.track_cross_device.clear()
-     return super().free_cache()
-   def _alloc(self, size:int, options) -> Any:
+   def _alloc(self, size:int, options) -> MetalBuffer:
    ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
    if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}")
-     return ret
-   def
-
-
-
-     encoder.
+     return MetalBuffer(ret, size)
+   def _free(self, opaque:MetalBuffer, options): opaque.buf.release()
+   def transfer(self, dest:MetalBuffer, src:MetalBuffer, sz:int, src_dev:MetalDevice, dest_dev:MetalDevice):
+     dest_dev.synchronize()
+     src_command_buffer = src_dev.mtl_queue.commandBuffer()
+     encoder = src_command_buffer.blitCommandEncoder()
+     encoder.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size_(src.buf, src.offset, dest.buf, dest.offset, sz)
    encoder.endEncoding()
-
-
+     if src_dev != dest_dev:
+       src_command_buffer.encodeSignalEvent_value_(src_dev.timeline_signal, src_dev.timeline_value)
+       dest_command_buffer = dest_dev.mtl_queue.commandBuffer()
+       dest_command_buffer.encodeWaitForEvent_value_(src_dev.timeline_signal, src_dev.timeline_value)
+       dest_command_buffer.commit()
+       dest_dev.mtl_buffers_in_flight.append(dest_command_buffer)
+       src_dev.timeline_value += 1
+     src_command_buffer.commit()
+     src_dev.mtl_buffers_in_flight.append(src_command_buffer)
  def from_buffer(self, src:memoryview) -> Optional[Any]:
-     ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src,
+     ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, src.nbytes, Metal.MTLResourceStorageModeShared, None)
    if ret: self.device.mv_in_metal.append(src)
-     return ret
-   def
-   def as_buffer(self, src:Any) -> memoryview:
+     return MetalBuffer(ret, src.nbytes)
+   def as_buffer(self, src:MetalBuffer) -> memoryview:
    self.device.synchronize()
-     return src.contents().as_buffer(src.
-   def copyin(self, dest:
-   def copyout(self, dest:memoryview, src:
+     return src.buf.contents().as_buffer(src.offset+src.size)[src.offset:]
+   def copyin(self, dest:MetalBuffer, src:memoryview): self.as_buffer(dest)[:] = src
+   def copyout(self, dest:memoryview, src:MetalBuffer): dest[:] = self.as_buffer(src)
+   def offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset)

class MetalDevice(Compiled):
  def __init__(self, device:str):
    self.device = Metal.MTLCreateSystemDefaultDevice()
    self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
+     if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue")
+
    self.mtl_buffers_in_flight: List[Any] = []
    self.mv_in_metal: List[memoryview] = []
-
+
+     self.timeline_signal = self.device.newSharedEvent()
+     self.timeline_value = 0
+

    from tinygrad.runtime.graph.metal import MetalGraph
    super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler(None if getenv("METAL_XCODE") else self),
                     functools.partial(MetalProgram, self), MetalGraph)
@@ -102,4 +114,3 @@ class MetalDevice(Compiled):
    for cbuf in self.mtl_buffers_in_flight: wait_check(cbuf)
    self.mv_in_metal.clear()
    self.mtl_buffers_in_flight.clear()
-     self.track_cross_buffer.clear()
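
The MetalBuffer wrapper introduced above carries a byte offset alongside the underlying MTLBuffer, which is what lets the allocator's new offset() method hand out sub-views without copying. A minimal sketch, assuming macOS with the Metal bindings installed; it pokes the device allocator directly for illustration only:

from tinygrad import Device

dev = Device["METAL"]
base = dev.allocator.alloc(16)            # MetalBuffer wrapping a fresh 16-byte MTLBuffer
view = dev.allocator.offset(base, 8, 8)   # same MTLBuffer, viewed as its last 8 bytes
assert view.buf is base.buf and (view.size, view.offset) == (8, 8)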