tinygrad 0.9.1-py3-none-any.whl → 0.9.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. tinygrad/codegen/kernel.py +248 -115
  2. tinygrad/codegen/lowerer.py +215 -0
  3. tinygrad/codegen/transcendental.py +310 -0
  4. tinygrad/codegen/uopgraph.py +622 -0
  5. tinygrad/codegen/uops.py +235 -393
  6. tinygrad/device.py +428 -69
  7. tinygrad/dtype.py +18 -4
  8. tinygrad/engine/graph.py +19 -32
  9. tinygrad/engine/jit.py +148 -70
  10. tinygrad/engine/realize.py +127 -51
  11. tinygrad/engine/schedule.py +259 -216
  12. tinygrad/engine/search.py +29 -22
  13. tinygrad/function.py +9 -0
  14. tinygrad/helpers.py +87 -49
  15. tinygrad/lazy.py +34 -35
  16. tinygrad/multi.py +41 -36
  17. tinygrad/nn/__init__.py +39 -22
  18. tinygrad/nn/state.py +3 -3
  19. tinygrad/ops.py +63 -62
  20. tinygrad/renderer/__init__.py +43 -21
  21. tinygrad/renderer/assembly.py +104 -106
  22. tinygrad/renderer/cstyle.py +87 -60
  23. tinygrad/renderer/llvmir.py +21 -30
  24. tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
  25. tinygrad/runtime/autogen/cuda.py +6 -162
  26. tinygrad/runtime/autogen/kfd.py +32 -0
  27. tinygrad/runtime/autogen/libc.py +4260 -0
  28. tinygrad/runtime/autogen/nvrtc.py +579 -0
  29. tinygrad/runtime/graph/clang.py +2 -2
  30. tinygrad/runtime/graph/cuda.py +8 -11
  31. tinygrad/runtime/graph/hcq.py +120 -107
  32. tinygrad/runtime/graph/metal.py +18 -15
  33. tinygrad/runtime/ops_amd.py +197 -305
  34. tinygrad/runtime/ops_clang.py +2 -2
  35. tinygrad/runtime/ops_cuda.py +36 -94
  36. tinygrad/runtime/ops_disk.py +3 -7
  37. tinygrad/runtime/ops_gpu.py +4 -2
  38. tinygrad/runtime/ops_hip.py +70 -0
  39. tinygrad/runtime/ops_metal.py +38 -27
  40. tinygrad/runtime/ops_nv.py +283 -363
  41. tinygrad/runtime/ops_python.py +26 -30
  42. tinygrad/runtime/support/compiler_cuda.py +78 -0
  43. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
  44. tinygrad/runtime/support/elf.py +38 -0
  45. tinygrad/shape/shapetracker.py +5 -14
  46. tinygrad/shape/symbolic.py +4 -8
  47. tinygrad/shape/view.py +34 -22
  48. tinygrad/tensor.py +399 -97
  49. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
  50. tinygrad-0.9.2.dist-info/RECORD +70 -0
  51. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
  52. tinygrad/codegen/linearizer.py +0 -528
  53. tinygrad-0.9.1.dist-info/RECORD +0 -63
  54. /tinygrad/runtime/{driver → support}/__init__.py +0 -0
  55. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
  56. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0

tinygrad/runtime/ops_clang.py
@@ -7,8 +7,8 @@ class ClangCompiler(Compiler):
   def compile(self, src:str) -> bytes:
     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
     with tempfile.NamedTemporaryFile(delete=True) as output_file:
-      subprocess.check_output(['clang', '-include', 'tgmath.h', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-',
-                               '-o', str(output_file.name)], input=src.encode('utf-8'))
+      subprocess.check_output(['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
+                               '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
       return pathlib.Path(output_file.name).read_bytes()
 
 class ClangProgram:
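
Note: the new invocation compiles the generated C as a freestanding object with no libc or startup files, and the '-include tgmath.h' shim is dropped in the same change. A rough standalone sketch of the same flag set (clang on PATH assumed; the add() kernel and the omission of -Wall/-Werror are illustrative, not from the diff):

import ctypes, subprocess, tempfile

src = "int add(int a, int b) { return a + b; }"
with tempfile.NamedTemporaryFile(suffix=".so", delete=False) as output_file:
  # compile stdin ('-') to a shared object without linking the C runtime
  subprocess.check_output(['clang', '-shared', '-march=native', '-O2', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
                           '-', '-o', output_file.name], input=src.encode('utf-8'))
lib = ctypes.CDLL(output_file.name)
assert lib.add(2, 3) == 5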

tinygrad/runtime/ops_cuda.py
@@ -1,30 +1,14 @@
 from __future__ import annotations
-import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
-from pathlib import Path
+import ctypes, ctypes.util, functools
 from typing import Tuple, Optional, List
-import tinygrad.runtime.autogen.cuda as cuda
-from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
+from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
+from tinygrad.device import Compiled, BufferOptions, LRUAllocator
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
+from tinygrad.runtime.autogen import cuda
+from tinygrad.runtime.support.compiler_cuda import cuda_disassemble, pretty_ptx, CUDACompiler, PTXCompiler, PTX
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl  # noqa: F401 # pylint: disable=unused-import
 
-def pretty_ptx(s):
-  # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
-  s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M)  # identifiers # noqa: E501
-  s = re.sub(r'(.)((?:b|s|u|f)(?:8|16|32|64)|pred)([\.\s])', lambda m:m[1]+colored(m[2], "green")+m[3], s, flags=re.M)  # types
-  s = re.sub(r'^(\s*)([\w]+)(.*?;$)', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M)  # instructions
-  s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M)  # numbers # noqa: E501
-  s = re.sub(r'(\.)(param|reg|global)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M)  # space
-  s = re.sub(r'(\.)(version|target|address_size|visible|entry)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M)  # derivatives
-  return s
-
-CUDACPU = getenv("CUDACPU") == 1
-if CUDACPU:
-  gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
-  gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int]  # noqa: E501
-  cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared)  # type: ignore # noqa: E501
-
 def check(status):
   if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}")  # noqa: E501
 
@@ -36,7 +20,6 @@ def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
   return c_args, vargs
 
 def cu_time_execution(cb, enable=False) -> Optional[float]:
-  if CUDACPU: return cpu_time_execution(cb, enable=enable)
   if not enable: return cb()
   evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
   cuda.cuEventRecord(evs[0], None)
@@ -47,69 +30,32 @@ def cu_time_execution(cb, enable=False) -> Optional[float]:
   for ev in evs: cuda.cuEventDestroy_v2(ev)
   return ret.value * 1e-3
 
-def _get_bytes(arg, get_str, get_sz, check) -> bytes:
-  sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
-  return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
-
-class PTXCompiler(Compiler):
-  def __init__(self, arch:str):
-    self.arch = arch
-    self.version = "7.8" if arch >= "sm_89" else "7.5"
-    super().__init__(f"compile_ptx_{self.arch}")
-  def compile(self, src:str) -> bytes: return src.replace("TARGET", self.arch).replace("VERSION", self.version).encode()
-
-class CUDACompiler(Compiler):
-  def __init__(self, arch:str):
-    self.arch = arch
-    check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
-    self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
-    if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
-    super().__init__(f"compile_cuda_{self.arch}")
-  def compile(self, src:str) -> bytes:
-    check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
-    status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
-
-    if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
-    return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
-
-def cuda_disassemble(lib, arch):
-  try:
-    fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
-    with open(fn + ".ptx", "wb") as f: f.write(lib)
-    subprocess.run(["ptxas", f"-arch={arch}", "-o", fn, fn+".ptx"], check=True)
-    print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
-  except Exception as e: print("failed to generate SASS", str(e))
-
 class CUDAProgram:
   def __init__(self, device:CUDADevice, name:str, lib:bytes):
     self.device, self.name, self.lib = device, name, lib
     if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
     if DEBUG >= 6: cuda_disassemble(lib, device.arch)
 
-    if CUDACPU: self.prg = lib
-    else:
-      check(cuda.cuCtxSetCurrent(self.device.context))
-      self.module = cuda.CUmodule()
-      status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
-      if status != 0:
-        del self.module
-        cuda_disassemble(lib, device.arch)
-        raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
-      check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
-      self.prg = prg #type: ignore
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    self.module = cuda.CUmodule()
+    status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
+    if status != 0:
+      del self.module
+      cuda_disassemble(lib, device.arch)
+      raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
+    check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
+    self.prg = prg #type: ignore
 
   def __del__(self):
     if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
 
   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    if CUDACPU: self.vargs = args+tuple(vals)
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    if not hasattr(self, "vargs"):
+      self.c_args, self.vargs = encode_args(args, vals) #type: ignore
     else:
-      check(cuda.cuCtxSetCurrent(self.device.context))
-      if not hasattr(self, "vargs"):
-        self.c_args, self.vargs = encode_args(args, vals) #type: ignore
-      else:
-        for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
-        for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+      for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+      for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
     return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs)), enable=wait)
 
 class CUDAAllocator(LRUAllocator):
@@ -148,33 +94,29 @@ class CUDADevice(Compiled):
 
   def __init__(self, device:str):
     device_id = int(device.split(":")[1]) if ":" in device else 0
-    if not CUDACPU:
-      check(cuda.cuInit(0))
-      self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
-      self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
-      check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
-
-      for dev in CUDADevice.devices:
-        check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
-        if val.value != 1: continue
-        check(cuda.cuCtxSetCurrent(dev.context))
-        check(cuda.cuCtxEnablePeerAccess(self.context, 0))
-        check(cuda.cuCtxSetCurrent(self.context))
-        check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
-        CUDADevice.peer_access = True
-
-    self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
+    check(cuda.cuInit(0))
+    self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
+    self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
+    check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
+
+    for dev in CUDADevice.devices:
+      check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
+      if val.value != 1: continue
+      check(cuda.cuCtxSetCurrent(dev.context))
+      check(cuda.cuCtxEnablePeerAccess(self.context, 0))
+      check(cuda.cuCtxSetCurrent(self.context))
+      check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
+      CUDADevice.peer_access = True
+
+    self.arch = f"sm_{major.value}{minor.value}"
     self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
     CUDADevice.devices.append(self)
 
     from tinygrad.runtime.graph.cuda import CUDAGraph
-    super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
-                     PTXRenderer(self.arch) if getenv("PTX") else CUDARenderer(self.arch),
-                     PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
-                     functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
+    super().__init__(device, CUDAAllocator(self), PTXRenderer(self.arch) if PTX else CUDARenderer(self.arch),
+                     PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self), graph=CUDAGraph)
 
   def synchronize(self):
-    if CUDACPU: return
     check(cuda.cuCtxSetCurrent(self.context))
     check(cuda.cuCtxSynchronize())
     for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
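
Note: CUDAProgram.__call__ now always goes through the driver API and caches the encoded argument struct between launches. A minimal standalone sketch of that caching pattern (make_args_t is a hypothetical stand-in for tinygrad.helpers.init_c_struct_t, which is what encode_args uses):

import ctypes

def make_args_t(n_ptrs, n_ints):
  # one c_void_p field per buffer argument, one c_int field per value argument
  fields = [(f'f{i}', ctypes.c_void_p) for i in range(n_ptrs)] + [(f'v{i}', ctypes.c_int) for i in range(n_ints)]
  return type("CArgs", (ctypes.Structure,), {"_fields_": fields})

c_args = make_args_t(1, 1)(0, 8)  # first launch: build the struct once
c_args.f0, c_args.v0 = 0, 16      # later launches: only overwrite the fields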

tinygrad/runtime/ops_disk.py
@@ -1,13 +1,9 @@
 from __future__ import annotations
-import os, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
+import os, sys, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
 from typing import Optional, Generator, Tuple, Callable, List
 from tinygrad.helpers import OSX, round_up
 from tinygrad.device import Compiled, Allocator
-import tinygrad.runtime.autogen.io_uring as io_uring
-
-libc = ctypes.CDLL(ctypes.util.find_library("c"))
-libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
-libc.mmap.restype = ctypes.c_void_p
+from tinygrad.runtime.autogen import io_uring, libc
 
 class DiskBuffer:
   def __init__(self, device:DiskDevice, size:int, offset=0):
@@ -104,7 +100,7 @@ class DiskDevice(Compiled):
   def _iouring_setup(self):
     DiskDevice._tried_io_uring_init = True
 
-    if platform.system() != 'Linux': return
+    if platform.system() != 'Linux' or hasattr(sys, "getandroidapilevel"): return
 
     fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
     if fd < 0: return
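
Note: the extra guard skips io_uring on Android, where CPython reports Linux from platform.system() but also exposes sys.getandroidapilevel. Equivalent standalone check:

import sys, platform
use_io_uring = platform.system() == 'Linux' and not hasattr(sys, "getandroidapilevel")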

tinygrad/runtime/ops_gpu.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from typing import Tuple, Optional, List, cast
 import ctypes, functools, hashlib
-import tinygrad.runtime.autogen.opencl as cl
+from tinygrad.runtime.autogen import opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
 from tinygrad.renderer.cstyle import OpenCLRenderer
 from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
@@ -9,8 +9,9 @@ from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, Com
 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
 
+cl_errors = {attr: k for k in dir(cl) if k.startswith("CL_") and (attr:=getattr(cl, k)) <= 0}
 def check(status):
-  if status != 0: raise RuntimeError(f"OpenCL Error {status}")
+  if status != 0: raise RuntimeError(f"OpenCL Error {status}: {cl_errors.get(status, 'Unknown error')}")
 def checked(ret, status): return (check(status.value), ret)[1]
 
 class CLCompiler(Compiler):
@@ -90,6 +91,7 @@ class CLDevice(Compiled):
     self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
     self.device_name = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_NAME, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1]  # noqa: E501
     self.driver_version = (cl.clGetDeviceInfo(self.device_id, cl.CL_DRIVER_VERSION, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1]  # noqa: E501
+    if DEBUG >= 1: print(f"CLDevice: opening {self.device_name} with version {self.driver_version}")
     self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
     self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
     self.pending_copyin: List[memoryview] = []
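
Note: cl_errors builds a reverse map from the non-positive CL_* status constants to their names so check() can print a readable error. The same comprehension run against a stand-in module (fake_cl and its constants are made up for illustration):

class fake_cl:
  CL_SUCCESS = 0
  CL_DEVICE_NOT_FOUND = -1
  CL_OUT_OF_HOST_MEMORY = -6

cl_errors = {attr: k for k in dir(fake_cl) if k.startswith("CL_") and (attr:=getattr(fake_cl, k)) <= 0}
assert cl_errors[-1] == "CL_DEVICE_NOT_FOUND"  # check(-1) would now report this name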

tinygrad/runtime/ops_hip.py (new file)
@@ -0,0 +1,70 @@
+from __future__ import annotations
+import ctypes, functools
+from typing import Tuple
+from tinygrad.helpers import DEBUG, init_c_var, from_mv, init_c_struct_t
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions
+from tinygrad.runtime.autogen import hip
+from tinygrad.runtime.support.compiler_hip import AMDCompiler, disasm
+from tinygrad.renderer.cstyle import HIPRenderer
+
+def check(status):
+  if status != 0: raise RuntimeError(f"HIP Error {status}, {ctypes.string_at(hip.hipGetErrorString(status)).decode()}")
+
+class HIPProgram:
+  def __init__(self, device:HIPDevice, name:str, lib:bytes):
+    self.device, self.name, self.lib = device, name, lib
+
+    if DEBUG >= 6: print(disasm(lib))
+
+    check(hip.hipSetDevice(self.device.device_id))
+    self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
+    self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))
+
+  def __del__(self):
+    if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
+
+  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    check(hip.hipSetDevice(self.device.device_id))
+    if not hasattr(self, "vargs"):
+      self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
+                                          [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
+      self.vargs = (ctypes.c_void_p * 5)(1, ctypes.cast(ctypes.byref(self.c_args), ctypes.c_void_p), 2,
+                                         ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(self.c_args))), ctypes.c_void_p), 3)
+
+    for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+    for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+
+    if wait: check(hip.hipEventRecord(self.device.time_event_st, None))
+
+    check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs))
+
+    if wait:
+      check(hip.hipEventRecord(self.device.time_event_en, None))
+      check(hip.hipEventSynchronize(self.device.time_event_en))
+      check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.device.time_event_st, self.device.time_event_en))
+      return ret.value * 1e-3
+
+class HIPAllocator(LRUAllocator):
+  def __init__(self, device:HIPDevice):
+    self.device = device
+    super().__init__()
+  def _alloc(self, size:int, options:BufferOptions):
+    check(hip.hipSetDevice(self.device.device_id))
+    return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
+  def _free(self, opaque, options:BufferOptions): check(hip.hipFree(opaque))
+  def copyin(self, dest, src: memoryview):
+    check(hip.hipSetDevice(self.device.device_id))
+    check(hip.hipMemcpy(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice))
+  def copyout(self, dest:memoryview, src):
+    self.device.synchronize()
+    check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
+
+class HIPDevice(Compiled):
+  def __init__(self, device:str=""):
+    self.device_id = int(device.split(":")[1]) if ":" in device else 0
+    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
+    self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
+    super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
+  def synchronize(self):
+    check(hip.hipSetDevice(self.device_id))
+    check(hip.hipDeviceSynchronize())
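
Note: the (ctypes.c_void_p * 5) array in HIPProgram.__call__ is the hipModuleLaunchKernel "extra" launch config; the sentinels 1, 2 and 3 are assumed to correspond to HIP_LAUNCH_PARAM_BUFFER_POINTER, HIP_LAUNCH_PARAM_BUFFER_SIZE and HIP_LAUNCH_PARAM_END from the HIP headers. Standalone sketch of the packing (KernArgs is an illustrative layout, not the real kernel signature):

import ctypes

class KernArgs(ctypes.Structure):
  _fields_ = [("f0", ctypes.c_void_p), ("v0", ctypes.c_int)]  # one device pointer, one int value

c_args = KernArgs(0, 42)
vargs = (ctypes.c_void_p * 5)(1, ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p),                                    # BUFFER_POINTER, &args
                              2, ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p),  # BUFFER_SIZE, &sizeof(args)
                              3)                                                                                         # END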

tinygrad/runtime/ops_metal.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 import os, subprocess, pathlib, ctypes, tempfile, functools
 import Metal, libdispatch
-from typing import List, Set, Any, Tuple, Optional
+from typing import List, Any, Tuple, Optional
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
 from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator
 from tinygrad.renderer.cstyle import MetalRenderer
@@ -33,7 +33,9 @@ class MetalProgram:
       with tempfile.NamedTemporaryFile(delete=True) as shader:
         shader.write(lib)
         shader.flush()
-        os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+        ret = os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+        if ret:
+          print("Error running disassembler: Make sure you have https://github.com/dougallj/applegpu cloned to tinygrad/extra/disassemblers/applegpu")
     assert lib[:4] == b"MTLB", "Invalid Metal library. Could be due to using conda. Try system python or METAL_XCODE=1 DISABLE_COMPILER_CACHE=1."
     data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
     self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
@@ -45,7 +47,7 @@ class MetalProgram:
     command_buffer = self.device.mtl_queue.commandBuffer()
     encoder = command_buffer.computeCommandEncoder()
     encoder.setComputePipelineState_(self.pipeline_state)
-    for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a, 0, i)
+    for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a.buf, a.offset, i)
     for i,a in enumerate(vals,start=len(bufs)): encoder.setBytes_length_atIndex_(ctypes.c_int32(a), 4, i)
     encoder.dispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
     encoder.endEncoding()
@@ -55,46 +57,56 @@ class MetalProgram:
       return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
     self.device.mtl_buffers_in_flight.append(command_buffer)
 
+class MetalBuffer:
+  def __init__(self, buf:Any, size:int, offset=0): self.buf, self.size, self.offset = buf, size, offset
+
 class MetalAllocator(LRUAllocator):
   def __init__(self, device:MetalDevice):
     self.device:MetalDevice = device
-    self.track_cross_device: Set[MetalDevice] = set()
     super().__init__()
-  def free_cache(self):
-    self.device.synchronize()
-    for x in self.track_cross_device: x.synchronize()
-    self.track_cross_device.clear()
-    return super().free_cache()
-  def _alloc(self, size:int, options) -> Any:
+  def _alloc(self, size:int, options) -> MetalBuffer:
     ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
     if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}")
-    return ret
-  def transfer(self, dest:Any, src:Any, sz:int, src_dev: MetalDevice, **kwargs):
-    src_dev.synchronize()
-    command_buffer = self.device.mtl_queue.commandBuffer()
-    encoder = command_buffer.blitCommandEncoder()
-    encoder.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size_(src, 0, dest, 0, sz)
+    return MetalBuffer(ret, size)
+  def _free(self, opaque:MetalBuffer, options): opaque.buf.release()
+  def transfer(self, dest:MetalBuffer, src:MetalBuffer, sz:int, src_dev:MetalDevice, dest_dev:MetalDevice):
+    dest_dev.synchronize()
+    src_command_buffer = src_dev.mtl_queue.commandBuffer()
+    encoder = src_command_buffer.blitCommandEncoder()
+    encoder.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size_(src.buf, src.offset, dest.buf, dest.offset, sz)
     encoder.endEncoding()
-    command_buffer.commit()
-    self.device.mtl_buffers_in_flight.append(command_buffer)
+    if src_dev != dest_dev:
+      src_command_buffer.encodeSignalEvent_value_(src_dev.timeline_signal, src_dev.timeline_value)
+      dest_command_buffer = dest_dev.mtl_queue.commandBuffer()
+      dest_command_buffer.encodeWaitForEvent_value_(src_dev.timeline_signal, src_dev.timeline_value)
+      dest_command_buffer.commit()
+      dest_dev.mtl_buffers_in_flight.append(dest_command_buffer)
+      src_dev.timeline_value += 1
+    src_command_buffer.commit()
+    src_dev.mtl_buffers_in_flight.append(src_command_buffer)
   def from_buffer(self, src:memoryview) -> Optional[Any]:
-    ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, len(src), Metal.MTLResourceStorageModeShared, None)
+    ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, src.nbytes, Metal.MTLResourceStorageModeShared, None)
     if ret: self.device.mv_in_metal.append(src)
-    return ret
-  def _free(self, opaque:Any, options): opaque.release()
-  def as_buffer(self, src:Any) -> memoryview:
+    return MetalBuffer(ret, src.nbytes)
+  def as_buffer(self, src:MetalBuffer) -> memoryview:
     self.device.synchronize()
-    return src.contents().as_buffer(src.length())
-  def copyin(self, dest:Any, src:memoryview): self.as_buffer(dest)[:] = src
-  def copyout(self, dest:memoryview, src:Any): dest[:] = self.as_buffer(src)
+    return src.buf.contents().as_buffer(src.offset+src.size)[src.offset:]
+  def copyin(self, dest:MetalBuffer, src:memoryview): self.as_buffer(dest)[:] = src
+  def copyout(self, dest:memoryview, src:MetalBuffer): dest[:] = self.as_buffer(src)
+  def offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset)
 
 class MetalDevice(Compiled):
   def __init__(self, device:str):
     self.device = Metal.MTLCreateSystemDefaultDevice()
     self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
+    if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue")
+
     self.mtl_buffers_in_flight: List[Any] = []
     self.mv_in_metal: List[memoryview] = []
-    self.track_cross_buffer: List[Any] = []
+
+    self.timeline_signal = self.device.newSharedEvent()
+    self.timeline_value = 0
+
     from tinygrad.runtime.graph.metal import MetalGraph
     super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler(None if getenv("METAL_XCODE") else self),
                      functools.partial(MetalProgram, self), MetalGraph)
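
Note: MetalBuffer lets one allocation be viewed at an offset; as_buffer maps offset+size bytes of the parent buffer and then slices off the offset. Pure-Python sketch of that view logic (FakeMetalBuffer and the bytearray backing are stand-ins for the Metal objects):

class FakeMetalBuffer:
  def __init__(self, buf, size, offset=0): self.buf, self.size, self.offset = buf, size, offset

def as_buffer(src: FakeMetalBuffer) -> memoryview:
  return memoryview(src.buf)[:src.offset+src.size][src.offset:]

backing = bytearray(16)
sub = FakeMetalBuffer(backing, size=8, offset=4)  # analogous to what offset() returns
as_buffer(sub)[:] = b"\x11" * 8                   # copyin into the sub-buffer
assert bytes(backing[4:12]) == b"\x11" * 8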
@@ -102,4 +114,3 @@ class MetalDevice(Compiled):
     for cbuf in self.mtl_buffers_in_flight: wait_check(cbuf)
     self.mv_in_metal.clear()
     self.mtl_buffers_in_flight.clear()
-    self.track_cross_buffer.clear()