tinygrad 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/kernel.py +248 -115
- tinygrad/codegen/lowerer.py +215 -0
- tinygrad/codegen/transcendental.py +310 -0
- tinygrad/codegen/uopgraph.py +622 -0
- tinygrad/codegen/uops.py +235 -393
- tinygrad/device.py +428 -69
- tinygrad/dtype.py +18 -4
- tinygrad/engine/graph.py +19 -32
- tinygrad/engine/jit.py +148 -70
- tinygrad/engine/realize.py +127 -51
- tinygrad/engine/schedule.py +259 -216
- tinygrad/engine/search.py +29 -22
- tinygrad/function.py +9 -0
- tinygrad/helpers.py +87 -49
- tinygrad/lazy.py +34 -35
- tinygrad/multi.py +41 -36
- tinygrad/nn/__init__.py +39 -22
- tinygrad/nn/state.py +3 -3
- tinygrad/ops.py +63 -62
- tinygrad/renderer/__init__.py +43 -21
- tinygrad/renderer/assembly.py +104 -106
- tinygrad/renderer/cstyle.py +87 -60
- tinygrad/renderer/llvmir.py +21 -30
- tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
- tinygrad/runtime/autogen/cuda.py +6 -162
- tinygrad/runtime/autogen/kfd.py +32 -0
- tinygrad/runtime/autogen/libc.py +4260 -0
- tinygrad/runtime/autogen/nvrtc.py +579 -0
- tinygrad/runtime/graph/clang.py +2 -2
- tinygrad/runtime/graph/cuda.py +8 -11
- tinygrad/runtime/graph/hcq.py +120 -107
- tinygrad/runtime/graph/metal.py +18 -15
- tinygrad/runtime/ops_amd.py +197 -305
- tinygrad/runtime/ops_clang.py +2 -2
- tinygrad/runtime/ops_cuda.py +36 -94
- tinygrad/runtime/ops_disk.py +3 -7
- tinygrad/runtime/ops_gpu.py +4 -2
- tinygrad/runtime/ops_hip.py +70 -0
- tinygrad/runtime/ops_metal.py +38 -27
- tinygrad/runtime/ops_nv.py +283 -363
- tinygrad/runtime/ops_python.py +26 -30
- tinygrad/runtime/support/compiler_cuda.py +78 -0
- tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
- tinygrad/runtime/support/elf.py +38 -0
- tinygrad/shape/shapetracker.py +5 -14
- tinygrad/shape/symbolic.py +4 -8
- tinygrad/shape/view.py +34 -22
- tinygrad/tensor.py +399 -97
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
- tinygrad-0.9.2.dist-info/RECORD +70 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/linearizer.py +0 -528
- tinygrad-0.9.1.dist-info/RECORD +0 -63
- /tinygrad/runtime/{driver → support}/__init__.py +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_clang.py
CHANGED
@@ -7,8 +7,8 @@ class ClangCompiler(Compiler):
  def compile(self, src:str) -> bytes:
    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
    with tempfile.NamedTemporaryFile(delete=True) as output_file:
-       subprocess.check_output(['clang', '-
-                                '-o', str(output_file.name)], input=src.encode('utf-8'))
+       subprocess.check_output(['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
+                                '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
      return pathlib.Path(output_file.name).read_bytes()

class ClangProgram:
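
A minimal sketch of the new clang invocation above, assuming clang is on PATH; the trivial `add` kernel and the ctypes loading step are illustrative, not part of the package:

import ctypes, pathlib, subprocess, tempfile

src = "int add(int a, int b) { return a + b; }"  # illustrative kernel source
with tempfile.NamedTemporaryFile(delete=True) as output_file:
  # same flags ClangCompiler.compile now passes: a freestanding, no-libc, position-independent shared object
  subprocess.check_output(['clang', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
                           '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
  fxn = ctypes.CDLL(str(output_file.name))  # ClangProgram loads the compiled object in a similar way
  print(fxn.add(2, 3))  # -> 5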
tinygrad/runtime/ops_cuda.py
CHANGED
@@ -1,30 +1,14 @@
from __future__ import annotations
- import
- from pathlib import Path
+ import ctypes, ctypes.util, functools
from typing import Tuple, Optional, List
-
- from tinygrad.
- from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
+ from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
+ from tinygrad.device import Compiled, BufferOptions, LRUAllocator
from tinygrad.renderer.cstyle import CUDARenderer
from tinygrad.renderer.assembly import PTXRenderer
+ from tinygrad.runtime.autogen import cuda
+ from tinygrad.runtime.support.compiler_cuda import cuda_disassemble, pretty_ptx, CUDACompiler, PTXCompiler, PTX
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import

- def pretty_ptx(s):
-   # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
-   s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers # noqa: E501
-   s = re.sub(r'(.)((?:b|s|u|f)(?:8|16|32|64)|pred)([\.\s])', lambda m:m[1]+colored(m[2], "green")+m[3], s, flags=re.M) # types
-   s = re.sub(r'^(\s*)([\w]+)(.*?;$)', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # instructions
-   s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers # noqa: E501
-   s = re.sub(r'(\.)(param|reg|global)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # space
-   s = re.sub(r'(\.)(version|target|address_size|visible|entry)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # derivatives
-   return s
-
- CUDACPU = getenv("CUDACPU") == 1
- if CUDACPU:
-   gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
-   gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] # noqa: E501
-   cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # type: ignore # noqa: E501
-
def check(status):
  if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501

@@ -36,7 +20,6 @@ def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
  return c_args, vargs

def cu_time_execution(cb, enable=False) -> Optional[float]:
-   if CUDACPU: return cpu_time_execution(cb, enable=enable)
  if not enable: return cb()
  evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
  cuda.cuEventRecord(evs[0], None)
@@ -47,69 +30,32 @@ def cu_time_execution(cb, enable=False) -> Optional[float]:
  for ev in evs: cuda.cuEventDestroy_v2(ev)
  return ret.value * 1e-3

- def _get_bytes(arg, get_str, get_sz, check) -> bytes:
-   sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
-   return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
-
- class PTXCompiler(Compiler):
-   def __init__(self, arch:str):
-     self.arch = arch
-     self.version = "7.8" if arch >= "sm_89" else "7.5"
-     super().__init__(f"compile_ptx_{self.arch}")
-   def compile(self, src:str) -> bytes: return src.replace("TARGET", self.arch).replace("VERSION", self.version).encode()
-
- class CUDACompiler(Compiler):
-   def __init__(self, arch:str):
-     self.arch = arch
-     check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
-     self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
-     if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
-     super().__init__(f"compile_cuda_{self.arch}")
-   def compile(self, src:str) -> bytes:
-     check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
-     status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
-
-     if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
-     return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
-
- def cuda_disassemble(lib, arch):
-   try:
-     fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
-     with open(fn + ".ptx", "wb") as f: f.write(lib)
-     subprocess.run(["ptxas", f"-arch={arch}", "-o", fn, fn+".ptx"], check=True)
-     print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
-   except Exception as e: print("failed to generate SASS", str(e))
-
class CUDAProgram:
  def __init__(self, device:CUDADevice, name:str, lib:bytes):
    self.device, self.name, self.lib = device, name, lib
    if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
    if DEBUG >= 6: cuda_disassemble(lib, device.arch)

-
-
-
-
-
-
-
-
-
-     check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
-     self.prg = prg #type: ignore
+     check(cuda.cuCtxSetCurrent(self.device.context))
+     self.module = cuda.CUmodule()
+     status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
+     if status != 0:
+       del self.module
+       cuda_disassemble(lib, device.arch)
+       raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
+     check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
+     self.prg = prg #type: ignore

  def __del__(self):
    if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-
+     check(cuda.cuCtxSetCurrent(self.device.context))
+     if not hasattr(self, "vargs"):
+       self.c_args, self.vargs = encode_args(args, vals) #type: ignore
    else:
-
-
-       self.c_args, self.vargs = encode_args(args, vals) #type: ignore
-       else:
-         for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
-         for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+       for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+       for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
    return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs)), enable=wait)

class CUDAAllocator(LRUAllocator):
@@ -148,33 +94,29 @@ class CUDADevice(Compiled):

  def __init__(self, device:str):
    device_id = int(device.split(":")[1]) if ":" in device else 0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-     self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
+     check(cuda.cuInit(0))
+     self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
+     self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
+     check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
+
+     for dev in CUDADevice.devices:
+       check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
+       if val.value != 1: continue
+       check(cuda.cuCtxSetCurrent(dev.context))
+       check(cuda.cuCtxEnablePeerAccess(self.context, 0))
+       check(cuda.cuCtxSetCurrent(self.context))
+       check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
+       CUDADevice.peer_access = True
+
+     self.arch = f"sm_{major.value}{minor.value}"
    self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
    CUDADevice.devices.append(self)

    from tinygrad.runtime.graph.cuda import CUDAGraph
-     super().__init__(device, CUDAAllocator(self) if
-
-                      PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
-                      functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
+     super().__init__(device, CUDAAllocator(self), PTXRenderer(self.arch) if PTX else CUDARenderer(self.arch),
+                      PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self), graph=CUDAGraph)

  def synchronize(self):
-     if CUDACPU: return
    check(cuda.cuCtxSetCurrent(self.context))
    check(cuda.cuCtxSynchronize())
    for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
tinygrad/runtime/ops_disk.py
CHANGED
@@ -1,13 +1,9 @@
from __future__ import annotations
- import os, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
+ import os, sys, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
from typing import Optional, Generator, Tuple, Callable, List
from tinygrad.helpers import OSX, round_up
from tinygrad.device import Compiled, Allocator
-
-
- libc = ctypes.CDLL(ctypes.util.find_library("c"))
- libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
- libc.mmap.restype = ctypes.c_void_p
+ from tinygrad.runtime.autogen import io_uring, libc

class DiskBuffer:
  def __init__(self, device:DiskDevice, size:int, offset=0):
@@ -104,7 +100,7 @@ class DiskDevice(Compiled):
  def _iouring_setup(self):
    DiskDevice._tried_io_uring_init = True

-     if platform.system() != 'Linux': return
+     if platform.system() != 'Linux' or hasattr(sys, "getandroidapilevel"): return

    fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
    if fd < 0: return
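
A small sketch of the new io_uring gate above: Android also reports platform.system() == 'Linux', so the CPython-only sys.getandroidapilevel marker is used to skip io_uring there. The helper name below is illustrative, not part of the package:

import platform, sys

def io_uring_supported() -> bool:  # mirrors the check added to DiskDevice._iouring_setup
  return platform.system() == 'Linux' and not hasattr(sys, "getandroidapilevel")

print(io_uring_supported())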
tinygrad/runtime/ops_gpu.py
CHANGED
@@ -1,7 +1,7 @@
from __future__ import annotations
from typing import Tuple, Optional, List, cast
import ctypes, functools, hashlib
-
+ from tinygrad.runtime.autogen import opencl as cl
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
from tinygrad.renderer.cstyle import OpenCLRenderer
from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
@@ -9,8 +9,9 @@ from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, Com
# see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
OSX_TIMING_RATIO = (125/3) if OSX else 1.0

+ cl_errors = {attr: k for k in dir(cl) if k.startswith("CL_") and (attr:=getattr(cl, k)) <= 0}
def check(status):
-   if status != 0: raise RuntimeError(f"OpenCL Error {status}")
+   if status != 0: raise RuntimeError(f"OpenCL Error {status}: {cl_errors.get(status, 'Unknown error')}")
def checked(ret, status): return (check(status.value), ret)[1]

class CLCompiler(Compiler):
@@ -90,6 +91,7 @@ class CLDevice(Compiled):
    self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
    self.device_name = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_NAME, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
    self.driver_version = (cl.clGetDeviceInfo(self.device_id, cl.CL_DRIVER_VERSION, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
+     if DEBUG >= 1: print(f"CLDevice: opening {self.device_name} with version {self.driver_version}")
    self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
    self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
    self.pending_copyin: List[memoryview] = []
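
A sketch of the new OpenCL error decoding above: cl_errors maps the non-positive CL_* constants from the autogenerated bindings back to their symbolic names, so a failing call now reports both code and name. The -4 code used below is CL_MEM_OBJECT_ALLOCATION_FAILURE in the OpenCL headers:

from tinygrad.runtime.autogen import opencl as cl

cl_errors = {attr: k for k in dir(cl) if k.startswith("CL_") and (attr := getattr(cl, k)) <= 0}
print(cl_errors.get(-4, 'Unknown error'))  # expected: CL_MEM_OBJECT_ALLOCATION_FAILURE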
tinygrad/runtime/ops_hip.py
ADDED
@@ -0,0 +1,70 @@
+ from __future__ import annotations
+ import ctypes, functools
+ from typing import Tuple
+ from tinygrad.helpers import DEBUG, init_c_var, from_mv, init_c_struct_t
+ from tinygrad.device import Compiled, LRUAllocator, BufferOptions
+ from tinygrad.runtime.autogen import hip
+ from tinygrad.runtime.support.compiler_hip import AMDCompiler, disasm
+ from tinygrad.renderer.cstyle import HIPRenderer
+
+ def check(status):
+   if status != 0: raise RuntimeError(f"HIP Error {status}, {ctypes.string_at(hip.hipGetErrorString(status)).decode()}")
+
+ class HIPProgram:
+   def __init__(self, device:HIPDevice, name:str, lib:bytes):
+     self.device, self.name, self.lib = device, name, lib
+
+     if DEBUG >= 6: print(disasm(lib))
+
+     check(hip.hipSetDevice(self.device.device_id))
+     self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
+     self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))
+
+   def __del__(self):
+     if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
+
+   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+     check(hip.hipSetDevice(self.device.device_id))
+     if not hasattr(self, "vargs"):
+       self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
+                                           [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
+       self.vargs = (ctypes.c_void_p * 5)(1, ctypes.cast(ctypes.byref(self.c_args), ctypes.c_void_p), 2,
+                                          ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(self.c_args))), ctypes.c_void_p), 3)
+     else:
+       for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+       for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+
+     if wait: check(hip.hipEventRecord(self.device.time_event_st, None))
+
+     check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs))
+
+     if wait:
+       check(hip.hipEventRecord(self.device.time_event_en, None))
+       check(hip.hipEventSynchronize(self.device.time_event_en))
+       check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.device.time_event_st, self.device.time_event_en))
+       return ret.value * 1e-3
+
+ class HIPAllocator(LRUAllocator):
+   def __init__(self, device:HIPDevice):
+     self.device = device
+     super().__init__()
+   def _alloc(self, size:int, options:BufferOptions):
+     check(hip.hipSetDevice(self.device.device_id))
+     return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
+   def _free(self, opaque, options:BufferOptions): check(hip.hipFree(opaque))
+   def copyin(self, dest, src: memoryview):
+     check(hip.hipSetDevice(self.device.device_id))
+     check(hip.hipMemcpy(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice))
+   def copyout(self, dest:memoryview, src):
+     self.device.synchronize()
+     check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
+
+ class HIPDevice(Compiled):
+   def __init__(self, device:str=""):
+     self.device_id = int(device.split(":")[1]) if ":" in device else 0
+     self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
+     self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
+     super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
+   def synchronize(self):
+     check(hip.hipSetDevice(self.device_id))
+     check(hip.hipDeviceSynchronize())
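
ops_hip.py is a new file in 0.9.2, giving tinygrad a standalone HIP backend alongside ops_amd.py. A minimal usage sketch, assuming an AMD GPU with the HIP runtime installed and that the device is addressable as "HIP" (tinygrad derives the name from the ops_hip.py module name):

from tinygrad import Tensor

a = Tensor([1.0, 2.0], device="HIP")
b = Tensor([3.0, 4.0], device="HIP")
print((a + b).numpy())  # expected: [4. 6.]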
tinygrad/runtime/ops_metal.py
CHANGED
@@ -1,7 +1,7 @@
from __future__ import annotations
import os, subprocess, pathlib, ctypes, tempfile, functools
import Metal, libdispatch
- from typing import List,
+ from typing import List, Any, Tuple, Optional
from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator
from tinygrad.renderer.cstyle import MetalRenderer
@@ -33,7 +33,9 @@ class MetalProgram:
    with tempfile.NamedTemporaryFile(delete=True) as shader:
      shader.write(lib)
      shader.flush()
-       os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+       ret = os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+       if ret:
+         print("Error running disassembler: Make sure you have https://github.com/dougallj/applegpu cloned to tinygrad/extra/disassemblers/applegpu")
    assert lib[:4] == b"MTLB", "Invalid Metal library. Could be due to using conda. Try system python or METAL_XCODE=1 DISABLE_COMPILER_CACHE=1."
    data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
    self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
@@ -45,7 +47,7 @@ class MetalProgram:
    command_buffer = self.device.mtl_queue.commandBuffer()
    encoder = command_buffer.computeCommandEncoder()
    encoder.setComputePipelineState_(self.pipeline_state)
-     for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a,
+     for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a.buf, a.offset, i)
    for i,a in enumerate(vals,start=len(bufs)): encoder.setBytes_length_atIndex_(ctypes.c_int32(a), 4, i)
    encoder.dispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
    encoder.endEncoding()
@@ -55,46 +57,56 @@ class MetalProgram:
      return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
    self.device.mtl_buffers_in_flight.append(command_buffer)

+ class MetalBuffer:
+   def __init__(self, buf:Any, size:int, offset=0): self.buf, self.size, self.offset = buf, size, offset
+
class MetalAllocator(LRUAllocator):
  def __init__(self, device:MetalDevice):
    self.device:MetalDevice = device
-     self.track_cross_device: Set[MetalDevice] = set()
    super().__init__()
-   def
-     self.device.synchronize()
-     for x in self.track_cross_device: x.synchronize()
-     self.track_cross_device.clear()
-     return super().free_cache()
-   def _alloc(self, size:int, options) -> Any:
+   def _alloc(self, size:int, options) -> MetalBuffer:
    ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
    if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}")
-     return ret
-   def
-
-
-
-     encoder.
+     return MetalBuffer(ret, size)
+   def _free(self, opaque:MetalBuffer, options): opaque.buf.release()
+   def transfer(self, dest:MetalBuffer, src:MetalBuffer, sz:int, src_dev:MetalDevice, dest_dev:MetalDevice):
+     dest_dev.synchronize()
+     src_command_buffer = src_dev.mtl_queue.commandBuffer()
+     encoder = src_command_buffer.blitCommandEncoder()
+     encoder.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size_(src.buf, src.offset, dest.buf, dest.offset, sz)
    encoder.endEncoding()
-
-
+     if src_dev != dest_dev:
+       src_command_buffer.encodeSignalEvent_value_(src_dev.timeline_signal, src_dev.timeline_value)
+       dest_command_buffer = dest_dev.mtl_queue.commandBuffer()
+       dest_command_buffer.encodeWaitForEvent_value_(src_dev.timeline_signal, src_dev.timeline_value)
+       dest_command_buffer.commit()
+       dest_dev.mtl_buffers_in_flight.append(dest_command_buffer)
+       src_dev.timeline_value += 1
+     src_command_buffer.commit()
+     src_dev.mtl_buffers_in_flight.append(src_command_buffer)
  def from_buffer(self, src:memoryview) -> Optional[Any]:
-     ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src,
+     ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, src.nbytes, Metal.MTLResourceStorageModeShared, None)
    if ret: self.device.mv_in_metal.append(src)
-     return ret
-   def
-   def as_buffer(self, src:Any) -> memoryview:
+     return MetalBuffer(ret, src.nbytes)
+   def as_buffer(self, src:MetalBuffer) -> memoryview:
    self.device.synchronize()
-     return src.contents().as_buffer(src.
-   def copyin(self, dest:
-   def copyout(self, dest:memoryview, src:
+     return src.buf.contents().as_buffer(src.offset+src.size)[src.offset:]
+   def copyin(self, dest:MetalBuffer, src:memoryview): self.as_buffer(dest)[:] = src
+   def copyout(self, dest:memoryview, src:MetalBuffer): dest[:] = self.as_buffer(src)
+   def offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset)

class MetalDevice(Compiled):
  def __init__(self, device:str):
    self.device = Metal.MTLCreateSystemDefaultDevice()
    self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
+     if self.mtl_queue is None: raise RuntimeError("Cannot allocate a new command queue")
+
    self.mtl_buffers_in_flight: List[Any] = []
    self.mv_in_metal: List[memoryview] = []
-
+
+     self.timeline_signal = self.device.newSharedEvent()
+     self.timeline_value = 0
+

    from tinygrad.runtime.graph.metal import MetalGraph
    super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler(None if getenv("METAL_XCODE") else self),
                     functools.partial(MetalProgram, self), MetalGraph)
@@ -102,4 +114,3 @@ class MetalDevice(Compiled):
    for cbuf in self.mtl_buffers_in_flight: wait_check(cbuf)
    self.mv_in_metal.clear()
    self.mtl_buffers_in_flight.clear()
-     self.track_cross_buffer.clear()
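
The MetalBuffer wrapper introduced above carries a byte offset alongside the underlying MTLBuffer, which is what lets the allocator's new offset() method hand out sub-views without copying. A minimal sketch, assuming macOS with the Metal bindings installed; it pokes the device allocator directly for illustration only:

from tinygrad import Device

dev = Device["METAL"]
base = dev.allocator.alloc(16)            # MetalBuffer wrapping a fresh 16-byte MTLBuffer
view = dev.allocator.offset(base, 8, 8)   # same MTLBuffer, viewed as its last 8 bytes
assert view.buf is base.buf and (view.size, view.offset) == (8, 8)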