tinygrad 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. tinygrad/__init__.py +6 -0
  2. tinygrad/codegen/kernel.py +572 -83
  3. tinygrad/codegen/linearizer.py +415 -395
  4. tinygrad/codegen/uops.py +415 -0
  5. tinygrad/device.py +183 -0
  6. tinygrad/dtype.py +113 -0
  7. tinygrad/engine/__init__.py +0 -0
  8. tinygrad/engine/graph.py +100 -0
  9. tinygrad/engine/jit.py +195 -0
  10. tinygrad/engine/realize.py +191 -0
  11. tinygrad/engine/schedule.py +362 -0
  12. tinygrad/engine/search.py +196 -0
  13. tinygrad/{mlops.py → function.py} +76 -55
  14. tinygrad/helpers.py +196 -89
  15. tinygrad/lazy.py +210 -371
  16. tinygrad/multi.py +169 -0
  17. tinygrad/nn/__init__.py +202 -22
  18. tinygrad/nn/datasets.py +7 -0
  19. tinygrad/nn/optim.py +112 -32
  20. tinygrad/nn/state.py +136 -39
  21. tinygrad/ops.py +119 -202
  22. tinygrad/renderer/__init__.py +61 -0
  23. tinygrad/renderer/assembly.py +276 -0
  24. tinygrad/renderer/cstyle.py +353 -166
  25. tinygrad/renderer/llvmir.py +150 -138
  26. tinygrad/runtime/autogen/amd_gpu.py +1900 -0
  27. tinygrad/runtime/autogen/comgr.py +865 -0
  28. tinygrad/runtime/autogen/cuda.py +5923 -0
  29. tinygrad/runtime/autogen/hip.py +5909 -0
  30. tinygrad/runtime/autogen/hsa.py +5761 -0
  31. tinygrad/runtime/autogen/kfd.py +812 -0
  32. tinygrad/runtime/autogen/nv_gpu.py +33328 -0
  33. tinygrad/runtime/autogen/opencl.py +1795 -0
  34. tinygrad/runtime/driver/hip_comgr.py +47 -0
  35. tinygrad/runtime/driver/hsa.py +143 -0
  36. tinygrad/runtime/graph/clang.py +38 -0
  37. tinygrad/runtime/graph/cuda.py +81 -0
  38. tinygrad/runtime/graph/hcq.py +143 -0
  39. tinygrad/runtime/graph/hsa.py +171 -0
  40. tinygrad/runtime/graph/metal.py +75 -0
  41. tinygrad/runtime/ops_amd.py +564 -0
  42. tinygrad/runtime/ops_clang.py +24 -77
  43. tinygrad/runtime/ops_cuda.py +175 -89
  44. tinygrad/runtime/ops_disk.py +56 -33
  45. tinygrad/runtime/ops_gpu.py +92 -95
  46. tinygrad/runtime/ops_hsa.py +278 -0
  47. tinygrad/runtime/ops_llvm.py +39 -60
  48. tinygrad/runtime/ops_metal.py +92 -74
  49. tinygrad/runtime/ops_npy.py +9 -0
  50. tinygrad/runtime/ops_nv.py +630 -0
  51. tinygrad/runtime/ops_python.py +204 -0
  52. tinygrad/shape/shapetracker.py +86 -254
  53. tinygrad/shape/symbolic.py +166 -141
  54. tinygrad/shape/view.py +296 -0
  55. tinygrad/tensor.py +2619 -448
  56. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
  57. tinygrad-0.9.0.dist-info/METADATA +227 -0
  58. tinygrad-0.9.0.dist-info/RECORD +60 -0
  59. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
  60. tinygrad/codegen/assembly.py +0 -190
  61. tinygrad/codegen/optimizer.py +0 -379
  62. tinygrad/codegen/search.py +0 -72
  63. tinygrad/graph.py +0 -83
  64. tinygrad/jit.py +0 -57
  65. tinygrad/nn/image.py +0 -100
  66. tinygrad/renderer/assembly_arm64.py +0 -169
  67. tinygrad/renderer/assembly_ptx.py +0 -98
  68. tinygrad/renderer/wgsl.py +0 -53
  69. tinygrad/runtime/lib.py +0 -113
  70. tinygrad/runtime/ops_cpu.py +0 -51
  71. tinygrad/runtime/ops_hip.py +0 -82
  72. tinygrad/runtime/ops_shm.py +0 -29
  73. tinygrad/runtime/ops_torch.py +0 -30
  74. tinygrad/runtime/ops_webgpu.py +0 -45
  75. tinygrad-0.7.0.dist-info/METADATA +0 -212
  76. tinygrad-0.7.0.dist-info/RECORD +0 -40
  77. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
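Most of the churn above is a reorganization of the package rather than new user-facing API: runtime plumbing moved from tinygrad/ops.py and tinygrad/runtime/lib.py into tinygrad/device.py, the JIT, graph, scheduler, and search moved under tinygrad/engine/, and mlops.py became function.py. A rough sketch of what that means for imports (the old and new Compiled paths are taken from the hunks below; TinyJit as the symbol carried from jit.py to engine/jit.py is an assumption based on tinygrad's public API):

```python
# 0.7.0                                   ->  0.9.0
# from tinygrad.ops import Compiled       ->  from tinygrad.device import Compiled
# from tinygrad.jit import TinyJit        ->  from tinygrad.engine.jit import TinyJit   (assumed symbol)
# tinygrad/mlops.py (autograd Functions)  ->  tinygrad/function.py
# tinygrad/runtime/lib.py (Raw*Buffer)    ->  allocators in tinygrad/device.py

# the user-facing entry point is unchanged in both versions
from tinygrad.tensor import Tensor

x = Tensor.rand(2, 2)
print((x @ x).numpy())
```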
tinygrad/runtime/ops_clang.py
@@ -1,81 +1,28 @@
-import os, time, ctypes, hashlib, subprocess, platform, tempfile, functools
-from functools import partial, reduce
-from tinygrad.ops import Compiled
-from tinygrad.helpers import fromimport, getenv, DEBUG, CI
-from tinygrad.runtime.lib import RawMallocBuffer
-from tinygrad.codegen.linearizer import LinearizerOptions
-from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
-import struct
-import numpy as np
+import ctypes, subprocess, pathlib, tempfile
+from tinygrad.device import Compiled, Compiler, MallocAllocator
+from tinygrad.helpers import cpu_time_execution, DEBUG, cpu_objdump
+from tinygrad.renderer.cstyle import ClangRenderer
 
-ARM64 = getenv('ARM64', False)
-if CI and ARM64: from unicorn import Uc, UC_ARCH_ARM64, UC_MODE_ARM, UC_HOOK_CODE, arm64_const # type: ignore
-
-args = {
-  'Windows': {'cflags':'', 'ext':'dll', 'exp':'__declspec(dllexport)'},
-  'Linux': {'cflags':'-lm -fPIC --rtlib=compiler-rt ', 'ext':'so', 'exp':''},
-  'Darwin': {'cflags':'-lm -fPIC --rtlib=compiler-rt ', 'ext':'dylib', 'exp':''}
-}[platform.system()]
-
-CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#define bool uchar\n'
-ADDRESS = 0x10000
-
-# Unicorn doesn't support external calls
-def align(addr): return (addr+4095) & ~(4095)
-mock_lm = {"sinf": np.sin, "sqrtf": np.sqrt, "exp2f": np.exp2, "log2f": np.log2}
-def emulate_ext_calls(fn, uc, address, size, user_data):
-  s_in = struct.unpack('f', struct.pack('I', uc.reg_read(getattr(arm64_const, f'UC_ARM64_REG_S{fn[2][1:]}'))))[0]
-  uc.reg_write(getattr(arm64_const, f'UC_ARM64_REG_S{fn[1][1:]}'), struct.unpack('I', struct.pack('f', mock_lm[fn[0]](s_in)))[0]) # type: ignore
+class ClangCompiler(Compiler):
+  def compile(self, src:str) -> bytes:
+    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
+    with tempfile.NamedTemporaryFile(delete=True) as output_file:
+      subprocess.check_output(['clang', '-include', 'tgmath.h', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-',
+                               '-o', str(output_file.name)], input=src.encode('utf-8'))
+      return pathlib.Path(output_file.name).read_bytes()
 
 class ClangProgram:
-  def __init__(self, name:str, prg:str, binary:bool=False):
-    # TODO: is there a way to not write this to disk?
-    # A: it seems there isn't https://stackoverflow.com/questions/28053328/ctypes-cdll-load-library-from-memory-rather-than-file
-    # because ctypes.CDLL() calls dlopen (POSIX) or LoadLibrary (Windows) which require a file
-    fn = f"{tempfile.gettempdir()}/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{args['ext']}"
-    if binary and DEBUG >= 5: print(prg)
-    if not os.path.exists(fn):
-      tmp = f"{fn}.{os.getpid()}.tmp"
-      if not binary:
-        prg = CLANG_PROGRAM_HEADER + prg
-        subprocess.check_output(args=('clang -shared -O2 -Wall -Werror -x c '+args['cflags']+' - -o '+tmp).split(), input=prg.encode('utf-8'))
-        os.rename(tmp, fn)
-      else:
-        if CI and ARM64:
-          prg = prg.split('\n') # type: ignore
-          self.varsize = align(int(prg[0].split(" ")[1]))
-          self.ext_calls = {(i*4+ADDRESS):ins.split(" ")[1:] for i, ins in enumerate(filter(lambda ins: ins[:4] != 'loop', prg[6:-3])) if ins[:2] == 'bl'}
-          prg = "\n".join(['nop' if ins[:2] == 'bl' else ins for ins in prg[6:-3]] + ['\n'])
-          subprocess.check_output(args=('aarch64-linux-gnu-as -o '+tmp).split(), input=prg.encode('utf-8'))
-          subprocess.check_output(args=('aarch64-linux-gnu-objcopy -O binary --only-section=.text '+tmp+' '+fn+'.bin').split())
-          self.prg = open(fn + '.bin', 'rb').read()
-          return
-        subprocess.check_output(args=('as -o' + tmp).split(), input=prg.encode('utf-8'))
-        subprocess.check_output(args=('clang -lm -shared '+tmp+' -o'+fn).split())
-    self.lib = ctypes.CDLL(fn)
-    self.fxn = self.lib[name]
-  def __call__(self, global_size, local_size, *args, wait=False):
-    if wait: st = time.monotonic()
-    if CI and ARM64:
-      mu = Uc(UC_ARCH_ARM64, UC_MODE_ARM)
-      total_mem = align(reduce(lambda total, arg: total + arg.size * arg.dtype.itemsize, args, len(self.prg)+self.varsize))
-      mu.mem_map(ADDRESS, total_mem)
-      for k, fn in self.ext_calls.items(): mu.hook_add(UC_HOOK_CODE, partial(emulate_ext_calls, fn), begin=k, end=k)
-      mu.mem_write(ADDRESS, self.prg + b''.join(bytes(arg._buf) for arg in args))
-      addr = ADDRESS + len(self.prg)
-      for i, arg in enumerate(args):
-        if i<=7:
-          mu.reg_write(getattr(arm64_const, f'UC_ARM64_REG_X{i}'), addr)
-        else:
-          # NOTE: In ARM, args beyond the first 8 are placed on the stack it also account for the stack red zone.
-          mu.mem_write(ADDRESS + total_mem - (len(args[8:])+2)*8 + 8*(i-8), addr.to_bytes(8, 'little'))
-        addr += arg.size * arg.dtype.itemsize
-      mu.reg_write(arm64_const.UC_ARM64_REG_SP, ADDRESS + total_mem - (len(args[8:])+2)*8)
-      mu.emu_start(ADDRESS, ADDRESS + len(self.prg))
-      args[0]._buf = mu.mem_read(mu.reg_read(arm64_const.UC_ARM64_REG_X0), args[0].size * args[0].dtype.itemsize)
-    else:
-      self.fxn(*[x._buf if isinstance(x, RawMallocBuffer) else x for x in args])
-    if wait: return time.monotonic()-st
+  def __init__(self, name:str, lib:bytes):
+    if DEBUG >= 6: cpu_objdump(lib)
+    self.name, self.lib = name, lib
+    # write to disk so we can load it
+    with tempfile.NamedTemporaryFile(delete=True) as cached_file_path:
+      pathlib.Path(cached_file_path.name).write_bytes(lib)
+      self.fxn = ctypes.CDLL(str(cached_file_path.name))[name]
+
+  def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)
 
-renderer = fromimport("tinygrad.renderer.assembly_arm64", "uops_to_arm64_asm") if ARM64 else functools.partial(uops_to_cstyle, CStyleLanguage(kernel_prefix=args['exp'], buffer_suffix=" restrict", arg_int_prefix="const int"))
-ClangBuffer = Compiled(RawMallocBuffer, LinearizerOptions(supports_float4=False, has_local=False), renderer, ClangProgram)
+class ClangDevice(Compiled):
+  def __init__(self, device:str):
+    from tinygrad.runtime.graph.clang import ClangGraph
+    super().__init__(device, MallocAllocator, ClangRenderer(), ClangCompiler("compile_clang"), ClangProgram, ClangGraph)
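The hunk above shows the 0.9.0 backend split: a Compiler turns rendered source into raw shared-object bytes, a Program loads those bytes and makes them callable, and the Device wires both to an allocator, renderer, and graph runner. A minimal sketch of driving the two Clang pieces by hand, assuming a working local clang toolchain; the C kernel is hand-written purely for illustration (ClangRenderer normally emits it):

```python
import ctypes
from tinygrad.runtime.ops_clang import ClangCompiler, ClangProgram

# hand-written stand-in for what ClangRenderer would emit (illustrative only)
src = "void add1(float* out, const float* a) { for (int i = 0; i < 4; i++) out[i] = a[i] + 1.0f; }"

lib = ClangCompiler("compile_clang").compile(src)  # clang -shared ... -> raw shared-object bytes
prg = ClangProgram("add1", lib)                    # writes the bytes to a temp file and dlopens them

out, a = (ctypes.c_float * 4)(), (ctypes.c_float * 4)(1, 2, 3, 4)
et = prg(out, a, wait=True)                        # cpu_time_execution returns seconds when wait=True
print([v for v in out], et)                        # [2.0, 3.0, 4.0, 5.0] <elapsed seconds>
```

ClangDevice performs exactly this wiring in its constructor, so in normal use none of these objects are touched directly.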
tinygrad/runtime/ops_cuda.py
@@ -1,99 +1,185 @@
-import subprocess, time, re, hashlib, tempfile, os, functools
-from typing import Optional
-import numpy as np
-from pycuda.compiler import compile as cuda_compile # type: ignore
-from tinygrad.helpers import DEBUG, getenv, colored, fromimport
-from tinygrad.ops import Compiled
-from tinygrad.runtime.lib import RawBufferCopyInOut, RawMallocBuffer, LRUAllocator
-from tinygrad.codegen.linearizer import LinearizerOptions
-from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
+from __future__ import annotations
+import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
+from pathlib import Path
+from typing import Tuple, Optional, List
+import tinygrad.runtime.autogen.cuda as cuda
+from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
+from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
+from tinygrad.renderer.cstyle import CUDARenderer
+from tinygrad.renderer.assembly import PTXRenderer
+if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
 
 def pretty_ptx(s):
   # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
-  s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers
+  s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers # noqa: E501
   s = re.sub(r'(.)((?:b|s|u|f)(?:8|16|32|64)|pred)([\.\s])', lambda m:m[1]+colored(m[2], "green")+m[3], s, flags=re.M) # types
   s = re.sub(r'^(\s*)([\w]+)(.*?;$)', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # instructions
-  s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers
+  s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers # noqa: E501
   s = re.sub(r'(\.)(param|reg|global)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # space
   s = re.sub(r'(\.)(version|target|address_size|visible|entry)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # derivatives
   return s
-def arch(): return "sm_" + "".join([str(x) for x in pycuda.driver.Context.get_device().compute_capability()])
-
-if getenv("CUDACPU", 0) == 1:
-  import ctypes, ctypes.util
-  lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
-  lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int]
-  class cuda:
-    class module:
-      def __init__(self, src): self.src = src
-      def get_function(self, _): return self
-      def __call__(self, *args, block, grid): lib.ptx_run(self.src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), *block, *grid)
-    module_from_buffer = lambda src: cuda.module(src) # pylint: disable=unnecessary-lambda # noqa: E731
-    class Event:
-      def __init__(self): pass
-      def record(self): self.start = time.perf_counter()
-      def time_till(self, other): return self.start - other.start
-      def synchronize(self): pass
-    class Context:
-      synchronize = lambda:0 # noqa: E731
-    CompileError = Exception
-  class context:
-    class device:
-      compute_capability = lambda: (3,5) # pylint: disable=unnecessary-lambda # noqa: E731
-    get_device = lambda: context.device # pylint: disable=unnecessary-lambda # noqa: E731
-  import pycuda.driver # type: ignore
-  pycuda.driver.Context = context
-  RawCUDABuffer = RawMallocBuffer
-else:
-  import pycuda.autoprimaryctx # type: ignore # pylint: disable=unused-import # noqa: F401
-  import pycuda.driver as cuda # type: ignore
-  class CUDAAllocator(LRUAllocator):
-    def _do_alloc(self, size, dtype, device, **kwargs): return cuda.mem_alloc(size * dtype.itemsize) # type: ignore
-    def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
-  CUDAAlloc = CUDAAllocator(pycuda.driver.Context.get_device().total_memory())
-  class RawCUDABuffer(RawBufferCopyInOut): # type: ignore
-    def __init__(self, size, dtype): super().__init__(size, dtype, allocator=CUDAAlloc)
-    def _copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._buf, x.ravel(), stream) # type: ignore
-    def _copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._buf) # type: ignore
+
+CUDACPU = getenv("CUDACPU") == 1
+if CUDACPU:
+  gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
+  gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] # noqa: E501
+  cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # type: ignore # noqa: E501
+
+def check(status):
+  if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
+
+def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
+  c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] +
+                                 [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
+  vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2),
+                                ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p), ctypes.c_void_p(0))
+  return c_args, vargs
+
+def cu_time_execution(cb, enable=False) -> Optional[float]:
+  if CUDACPU: return cpu_time_execution(cb, enable=enable)
+  if not enable: return cb()
+  evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
+  cuda.cuEventRecord(evs[0], None)
+  cb()
+  cuda.cuEventRecord(evs[1], None)
+  check(cuda.cuEventSynchronize(evs[1]))
+  cuda.cuEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), evs[0], evs[1])
+  for ev in evs: cuda.cuEventDestroy_v2(ev)
+  return ret.value * 1e-3
+
+def _get_bytes(arg, get_str, get_sz, check) -> bytes:
+  sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
+  return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
+
+class PTXCompiler(Compiler):
+  def __init__(self, arch:str):
+    self.arch = arch
+    self.version = "7.8" if arch >= "sm_89" else "7.5"
+    super().__init__(f"compile_ptx_{self.arch}")
+  def compile(self, src:str) -> bytes: return src.replace("TARGET", self.arch).replace("VERSION", self.version).encode()
+
+class CUDACompiler(Compiler):
+  def __init__(self, arch:str):
+    self.arch = arch
+    check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
+    self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
+    if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
+    super().__init__(f"compile_cuda_{self.arch}")
+  def compile(self, src:str) -> bytes:
+    check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
+    status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
+
+    if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
+    return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
+
+def cuda_disassemble(lib, arch):
+  try:
+    fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
+    with open(fn + ".ptx", "wb") as f: f.write(lib)
+    subprocess.run(["ptxas", f"-arch={arch}", "-o", fn, fn+".ptx"], check=True)
+    print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
+  except Exception as e: print("failed to generate SASS", str(e))
 
 class CUDAProgram:
-  def __init__(self, name:str, prg:str, binary=False):
-    if not binary:
-      try: prg = cuda_compile(prg, target="ptx", no_extern_c=True, options=['-Wno-deprecated-gpu-targets']).decode('utf-8')
-      except cuda.CompileError as e:
-        if DEBUG >= 3: print("FAILED TO BUILD", prg)
-        raise e
-    if DEBUG >= 5: print(pretty_ptx(prg))
-    if DEBUG >= 6:
-      try:
-        fn = os.path.join(tempfile.gettempdir(), f"tinycuda_{hashlib.md5(prg.encode('utf-8')).hexdigest()}")
-        with open(fn + ".ptx", "wb") as f: f.write(prg.encode('utf-8'))
-        subprocess.run(["ptxas", f"-arch={arch()}", "-o", fn, fn+".ptx"], check=True)
-        print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
-      except Exception as e: print("failed to generate SASS", str(e))
-    # TODO: name is wrong, so we get it from the ptx using hacks
-    self.prg = cuda.module_from_buffer(prg.encode('utf-8')).get_function(prg.split(".visible .entry ")[1].split("(")[0])
-
-  def __call__(self, global_size, local_size, *args, wait=False):
-    if wait:
-      start, end = cuda.Event(), cuda.Event()
-      start.record()
-    self.prg(*[x._buf if isinstance(x, RawCUDABuffer) else np.int32(x) if (isinstance(x, int) and not getenv("CUDACPU")) else x for x in args], block=tuple(local_size), grid=tuple(global_size))
-    if wait:
-      end.record()
-      end.synchronize()
-      return start.time_till(end)*1e-3
-
-renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
-  kernel_prefix = "__global__", smem_prefix = "__shared__ ", arg_int_prefix = "const int", barrier = "__syncthreads();", float4 = "make_float4",
-  gid = [f'blockIdx.{chr(120+i)}' for i in range(3)],
-  lid = [f'threadIdx.{chr(120+i)}' for i in range(3)],
-  half_prekernel = """
-    #include <cuda_fp16.h>
-    struct __align__(8) half4 {
-      half2 x, y;
-      __device__ __forceinline__ explicit half4(const float4& a): x(make_half2(__float2half(a.x), __float2half(a.y))), y(make_half2(__float2half(a.z),__float2half(a.w))) {}
-      __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); }
-    };
-  """)) if not getenv("PTX") else fromimport("tinygrad.renderer.assembly_ptx", "uops_to_ptx_asm")
-CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False if getenv("PTX") else True, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), renderer, CUDAProgram, cuda.Context.synchronize)
+  def __init__(self, device:CUDADevice, name:str, lib:bytes):
+    self.device, self.name, self.lib = device, name, lib
+    if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
+    if DEBUG >= 6: cuda_disassemble(lib, device.arch)
+
+    if CUDACPU: self.prg = lib
+    else:
+      check(cuda.cuCtxSetCurrent(self.device.context))
+      self.module = cuda.CUmodule()
+      status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
+      if status != 0:
+        del self.module
+        cuda_disassemble(lib, device.arch)
+        raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
+      check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
+      self.prg = prg #type: ignore
+
+  def __del__(self):
+    if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
+
+  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    if CUDACPU: self.vargs = args+tuple(vals)
+    else:
+      check(cuda.cuCtxSetCurrent(self.device.context))
+      if not hasattr(self, "vargs"):
+        self.c_args, self.vargs = encode_args(args, vals) #type: ignore
+      else:
+        for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+        for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+    return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs)), enable=wait)

+class CUDAAllocator(LRUAllocator):
+  def __init__(self, device:CUDADevice):
+    self.device = device
+    super().__init__()
+  def _alloc(self, size, options:BufferOptions):
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
+    else: return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
+  def _free(self, opaque, options:BufferOptions):
+    if options.host: return check(cuda.cuMemFreeHost(opaque))
+    else: check(cuda.cuMemFree_v2(opaque))
+  def copyin(self, dest, src:memoryview):
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    host_mem = self.alloc(len(src), BufferOptions(host=True))
+    self.device.pending_copyin.append((host_mem, len(src), BufferOptions(host=True)))
+    ctypes.memmove(host_mem, from_mv(src), len(src))
+    check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
+  def copyout(self, dest:memoryview, src):
+    CUDADevice.synchronize_system()
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest)))
+  def transfer(self, dest, src, sz:int, src_dev, dest_dev):
+    check(cuda.cuCtxSetCurrent(src_dev.context))
+    check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
+    check(cuda.cuMemcpyDtoDAsync_v2(dest, src, sz, None))
+    check(cuda.cuEventRecord(sync_event, None))
+    check(cuda.cuCtxSetCurrent(dest_dev.context))
+    check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev
+  def offset(self, buf, size:int, offset:int): return ctypes.c_ulong(buf.value + offset)
+
+class CUDADevice(Compiled):
+  devices: List[CUDADevice] = []
+  peer_access = False
+
+  def __init__(self, device:str):
+    device_id = int(device.split(":")[1]) if ":" in device else 0
+    if not CUDACPU:
+      check(cuda.cuInit(0))
+      self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
+      self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
+      check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
+
+      for dev in CUDADevice.devices:
+        check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
+        if val.value != 1: continue
+        check(cuda.cuCtxSetCurrent(dev.context))
+        check(cuda.cuCtxEnablePeerAccess(self.context, 0))
+        check(cuda.cuCtxSetCurrent(self.context))
+        check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
+        CUDADevice.peer_access = True
+
+    self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
+    self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
+    CUDADevice.devices.append(self)
+
+    from tinygrad.runtime.graph.cuda import CUDAGraph
+    super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
+                     PTXRenderer(self.arch) if getenv("PTX") else CUDARenderer(self.arch),
+                     PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
+                     functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
+
+  def synchronize(self):
+    if CUDACPU: return
+    check(cuda.cuCtxSetCurrent(self.context))
+    check(cuda.cuCtxSynchronize())
+    for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
+    self.pending_copyin.clear()
+
+  @staticmethod
+  def synchronize_system():
+    for d in CUDADevice.devices: d.synchronize()
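The same Compiler/Program/Allocator split appears here, but built on the raw ctypes bindings in tinygrad.runtime.autogen.cuda rather than PyCUDA. A hedged round-trip sketch, assuming an NVIDIA GPU with the CUDA driver and NVRTC installed; the kernel source, its name, and the launch shape are illustrative, not taken from the diff:

```python
import struct
from tinygrad.device import BufferOptions
from tinygrad.runtime.ops_cuda import CUDADevice, CUDACompiler, CUDAProgram

# hand-written stand-in for CUDARenderer output (illustrative only)
src = """extern "C" __global__ void add1(float* out, const float* a) {
  int i = threadIdx.x;
  out[i] = a[i] + 1.0f;
}"""

dev = CUDADevice("CUDA")                   # device 0: cuInit, cuCtxCreate, sets dev.arch
ptx = CUDACompiler(dev.arch).compile(src)  # NVRTC -> PTX bytes (raises CompileError on failure)
prg = CUDAProgram(dev, "add1", ptx)        # cuModuleLoadData + cuModuleGetFunction

a = memoryview(bytearray(struct.pack("4f", 1, 2, 3, 4)))
out = memoryview(bytearray(4 * 4))
a_buf = dev.allocator.alloc(len(a), BufferOptions())
out_buf = dev.allocator.alloc(len(out), BufferOptions())
dev.allocator.copyin(a_buf, a)             # staged through pinned host memory, async HtoD
prg(out_buf, a_buf, global_size=(1, 1, 1), local_size=(4, 1, 1), wait=True)
dev.allocator.copyout(out, out_buf)        # synchronizes, then DtoH
print(struct.unpack("4f", out))            # (2.0, 3.0, 4.0, 5.0)
```

With PTX=1 the device swaps in PTXRenderer/PTXCompiler, and with CUDACPU=1 the launch is routed through the gpuocelot shim shown at the top of the hunk.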
tinygrad/runtime/ops_disk.py
@@ -1,37 +1,60 @@
-import os, mmap
+from __future__ import annotations
+import os, mmap, _posixshmem, io
 from typing import Optional
-from typing import Callable, Dict, Tuple
-from tinygrad.helpers import prod, DType
-from tinygrad.runtime.lib import RawBufferMapped
-from tinygrad.ops import Interpreted, Op, MovementOps, UnaryOps
+from tinygrad.helpers import OSX
+from tinygrad.device import Compiled, Allocator
 
-class RawDiskBuffer(RawBufferMapped):
-  def __init__(self, size, dtype:DType, device:Optional[str]=None, buf=None, shape=None, offset=0): # pylint: disable=super-init-not-called
-    self.shape = (size, ) if shape is None else shape
-    self.offset = offset # this is an offset in bytes
-    assert device is not None or buf is not None, "disk tensor needs a path or a buf"
-    if device is not None:
-      f = open(device, "a+b")
-      if os.path.getsize(device) < size * dtype.itemsize: os.ftruncate(f.fileno(), size * dtype.itemsize)
-      buf = [f, mmap.mmap(f.fileno(), size * dtype.itemsize), 1]
+class DiskBuffer:
+  def __init__(self, device:DiskDevice, size:int, offset=0):
+    self.device, self.size, self.offset = device, size, offset
+  def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
+  def _buf(self) -> memoryview:
+    assert self.device.mem is not None, "DiskBuffer wasn't opened"
+    return memoryview(self.device.mem)[self.offset:self.offset+self.size]
+
+MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
+class DiskAllocator(Allocator):
+  def __init__(self, device:DiskDevice): self.device = device
+  def _alloc(self, size:int, options):
+    self.device._might_open(size)
+    return DiskBuffer(self.device, size)
+  def _free(self, buf, options): self.device._might_close()
+  def as_buffer(self, src:DiskBuffer): return src._buf()
+  def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
+  def copyout(self, dest:memoryview, src:DiskBuffer):
+    if OSX and hasattr(self.device, 'fd'):
+      # OSX doesn't seem great at mmap, this is faster
+      with io.FileIO(self.device.fd, "a+b", closefd=False) as fo:
+        fo.seek(src.offset)
+        fo.readinto(dest)
     else:
-      buf[2] += 1
-    # NOTE: we don't call super since disk tensors don't use RAM
-    self.size, self.dtype, self._buf = size, dtype, buf
-  def __del__(self):
-    self._buf[2] -= 1
-    if self._buf[2] == 0: self._buf[0].close()
-  def cast(self, arg:Tuple[DType, bool]): return RawDiskBuffer(self.size, arg[0], buf=self._buf, shape=self.shape, offset=self.offset)
-  def reshape(self, arg): return RawDiskBuffer(self.size, self.dtype, buf=self._buf, shape=arg, offset=self.offset)
-  def shrink(self, arg):
-    assert arg[1:] == tuple([(0,x) for x in self.shape[1:]]), f"can only slice the first dim of disk tensor {arg}"
-    offset = arg[0][0]*prod(self.shape[1:])*self.dtype.itemsize
-    size = (arg[0][1]-arg[0][0]) * prod(self.shape[1:])
-    return RawDiskBuffer(size, self.dtype, buf=self._buf, offset=self.offset+offset, shape=(arg[0][1]-arg[0][0],)+self.shape[1:])
-  def _buffer(self): return memoryview(self._buf[1])[self.offset:self.offset+self.size*self.dtype.itemsize]
-  def readinto(self, buf):
-    self._buf[0].seek(self.offset)
-    self._buf[0].readinto(buf)
+      dest[:] = src._buf()
+  def offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
+
+class DiskDevice(Compiled):
+  def __init__(self, device:str):
+    self.size: Optional[int] = None
+    self.count = 0
+    super().__init__(device, DiskAllocator(self), None, None, None)
+  def _might_open(self, size):
+    self.count += 1
+    assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
+    if self.size is not None: return
+    filename = self.dname[len("disk:"):]
+    self.size = size
 
-disk_fxn_for_op: Dict[Op, Callable] = { UnaryOps.NOOP: lambda x: x, UnaryOps.CAST: RawDiskBuffer.cast, MovementOps.RESHAPE: RawDiskBuffer.reshape, MovementOps.SHRINK: RawDiskBuffer.shrink }
-DiskBuffer = Interpreted(RawDiskBuffer, disk_fxn_for_op, to_underlying=lambda x:x, from_underlying=lambda x:x)
+    if filename.startswith("shm:"):
+      fd = _posixshmem.shm_open("/"+filename[4:].lstrip("/"), os.O_RDWR, 0o600)
+      self.mem = mmap.mmap(fd, self.size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)
+      os.close(fd)
+    else:
+      try: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT|(0 if OSX else os.O_DIRECT))
+      except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
+      if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
+      self.mem = mmap.mmap(self.fd, self.size)
+    if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None: self.mem.madvise(hp) # type: ignore
+  def _might_close(self):
+    self.count -= 1
+    if self.count == 0:
+      if hasattr(self, 'fd'): os.close(self.fd)
+      self.size = None
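DiskDevice replaces the old Interpreted RawDiskBuffer with an ordinary, compiler-less Compiled device whose allocator hands out offsets into a single mmap'd file. A small sketch of using it directly, assuming a writable scratch path; in normal use this layer is reached through Tensor's "disk:..." device strings rather than called by hand:

```python
import os, tempfile
from tinygrad.device import BufferOptions
from tinygrad.runtime.ops_disk import DiskDevice

path = os.path.join(tempfile.gettempdir(), "tinygrad_disk_demo.bin")  # illustrative scratch file
dev = DiskDevice(f"disk:{path}")

buf = dev.allocator.alloc(16, BufferOptions())  # first alloc opens and mmaps the file
dev.allocator.copyin(buf, memoryview(bytearray(b"0123456789abcdef")))
out = memoryview(bytearray(16))
dev.allocator.copyout(out, buf)                 # mmap read (or FileIO on OSX)
print(bytes(out))                               # b'0123456789abcdef'
```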