tinygrad 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. tinygrad/__init__.py +11 -6
  2. tinygrad/codegen/kernel.py +308 -175
  3. tinygrad/codegen/linearize.py +95 -0
  4. tinygrad/codegen/lowerer.py +143 -0
  5. tinygrad/codegen/transcendental.py +257 -0
  6. tinygrad/codegen/uopgraph.py +506 -0
  7. tinygrad/device.py +72 -171
  8. tinygrad/dtype.py +122 -47
  9. tinygrad/engine/jit.py +184 -87
  10. tinygrad/{lazy.py → engine/lazy.py} +74 -66
  11. tinygrad/engine/memory.py +51 -0
  12. tinygrad/engine/realize.py +86 -61
  13. tinygrad/engine/schedule.py +366 -317
  14. tinygrad/engine/search.py +58 -47
  15. tinygrad/function.py +59 -58
  16. tinygrad/helpers.py +120 -102
  17. tinygrad/multi.py +82 -78
  18. tinygrad/nn/__init__.py +116 -67
  19. tinygrad/nn/datasets.py +12 -5
  20. tinygrad/nn/optim.py +1 -1
  21. tinygrad/nn/state.py +91 -6
  22. tinygrad/ops.py +1126 -143
  23. tinygrad/renderer/__init__.py +47 -23
  24. tinygrad/renderer/cstyle.py +338 -265
  25. tinygrad/renderer/llvmir.py +125 -143
  26. tinygrad/renderer/ptx.py +225 -0
  27. tinygrad/runtime/autogen/adreno.py +17904 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +46974 -11993
  29. tinygrad/runtime/autogen/cuda.py +6 -162
  30. tinygrad/runtime/autogen/io_uring.py +97 -63
  31. tinygrad/runtime/autogen/kfd.py +60 -47
  32. tinygrad/runtime/autogen/kgsl.py +1386 -0
  33. tinygrad/runtime/autogen/libc.py +5462 -0
  34. tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
  35. tinygrad/runtime/autogen/nvrtc.py +579 -0
  36. tinygrad/runtime/autogen/opencl.py +11 -11
  37. tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
  38. tinygrad/runtime/graph/clang.py +3 -3
  39. tinygrad/runtime/graph/cuda.py +11 -15
  40. tinygrad/runtime/graph/hcq.py +120 -107
  41. tinygrad/runtime/graph/metal.py +71 -43
  42. tinygrad/runtime/ops_amd.py +244 -323
  43. tinygrad/runtime/ops_clang.py +12 -5
  44. tinygrad/runtime/ops_cloud.py +220 -0
  45. tinygrad/runtime/ops_cuda.py +42 -99
  46. tinygrad/runtime/ops_disk.py +25 -26
  47. tinygrad/runtime/ops_dsp.py +181 -0
  48. tinygrad/runtime/ops_gpu.py +29 -16
  49. tinygrad/runtime/ops_hip.py +68 -0
  50. tinygrad/runtime/ops_llvm.py +15 -10
  51. tinygrad/runtime/ops_metal.py +147 -64
  52. tinygrad/runtime/ops_nv.py +356 -397
  53. tinygrad/runtime/ops_python.py +78 -79
  54. tinygrad/runtime/ops_qcom.py +405 -0
  55. tinygrad/runtime/support/__init__.py +0 -0
  56. tinygrad/runtime/support/compiler_cuda.py +77 -0
  57. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +13 -1
  58. tinygrad/runtime/support/elf.py +38 -0
  59. tinygrad/runtime/support/hcq.py +539 -0
  60. tinygrad/shape/shapetracker.py +40 -50
  61. tinygrad/shape/view.py +102 -63
  62. tinygrad/tensor.py +1109 -365
  63. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/METADATA +54 -50
  64. tinygrad-0.10.0.dist-info/RECORD +77 -0
  65. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/WHEEL +1 -1
  66. tinygrad/codegen/linearizer.py +0 -528
  67. tinygrad/codegen/uops.py +0 -451
  68. tinygrad/engine/graph.py +0 -100
  69. tinygrad/renderer/assembly.py +0 -269
  70. tinygrad/shape/symbolic.py +0 -327
  71. tinygrad-0.9.1.dist-info/RECORD +0 -63
  72. /tinygrad/{runtime/driver/__init__.py → py.typed} +0 -0
  73. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/LICENSE +0 -0
  74. {tinygrad-0.9.1.dist-info → tinygrad-0.10.0.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_clang.py
@@ -1,19 +1,26 @@
+from typing import Optional, List
 import ctypes, subprocess, pathlib, tempfile
 from tinygrad.device import Compiled, Compiler, MallocAllocator
-from tinygrad.helpers import cpu_time_execution, DEBUG, cpu_objdump
+from tinygrad.helpers import cpu_time_execution, cpu_objdump
 from tinygrad.renderer.cstyle import ClangRenderer
 
 class ClangCompiler(Compiler):
+  def __init__(self, cachekey="compile_clang", args:Optional[List[str]]=None, objdump_tool='objdump'):
+    self.args = ['-march=native'] if args is None else args
+    self.objdump_tool = objdump_tool
+    super().__init__(cachekey)
+
   def compile(self, src:str) -> bytes:
     # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
     with tempfile.NamedTemporaryFile(delete=True) as output_file:
-      subprocess.check_output(['clang', '-include', 'tgmath.h', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-',
-                               '-o', str(output_file.name)], input=src.encode('utf-8'))
+      subprocess.check_output(['clang', '-shared', *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
+                               '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
       return pathlib.Path(output_file.name).read_bytes()
 
+  def disassemble(self, lib:bytes): return cpu_objdump(lib, self.objdump_tool)
+
 class ClangProgram:
   def __init__(self, name:str, lib:bytes):
-    if DEBUG >= 6: cpu_objdump(lib)
     self.name, self.lib = name, lib
     # write to disk so we can load it
     with tempfile.NamedTemporaryFile(delete=True) as cached_file_path:
@@ -25,4 +32,4 @@ class ClangProgram:
 class ClangDevice(Compiled):
   def __init__(self, device:str):
     from tinygrad.runtime.graph.clang import ClangGraph
-    super().__init__(device, MallocAllocator, ClangRenderer(), ClangCompiler("compile_clang"), ClangProgram, ClangGraph)
+    super().__init__(device, MallocAllocator, ClangRenderer(), ClangCompiler(), ClangProgram, ClangGraph)
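To make the reworked compiler concrete: `ClangCompiler` now takes its flags as a constructor argument (defaulting to `['-march=native']`), compiles freestanding code with `-ffreestanding -nostdlib`, and exposes disassembly as an explicit `disassemble` method instead of a `DEBUG>=6` side effect. A minimal usage sketch follows; it assumes `ClangProgram.__call__` still forwards buffers directly to the loaded symbol (that method is unchanged and not shown in this hunk), a `clang` binary on PATH, and a hypothetical kernel named `add1`.

```python
# sketch: drive the new ClangCompiler/ClangProgram pair by hand
import ctypes
from tinygrad.runtime.ops_clang import ClangCompiler, ClangProgram

src = "void add1(float *out, const float *in) { for (int i = 0; i < 4; i++) out[i] = in[i] + 1.0f; }"
lib = ClangCompiler().compile(src)   # clang -shared -march=native ... -ffreestanding -nostdlib
prg = ClangProgram("add1", lib)      # writes the .so to a temp file and dlopen's it

out, inp = (ctypes.c_float * 4)(), (ctypes.c_float * 4)(1, 2, 3, 4)
prg(out, inp)                        # call the compiled symbol with ctypes buffers
print(list(out))                     # expected: [2.0, 3.0, 4.0, 5.0]
```

The `args`/`objdump_tool` parameters let callers swap in cross-compilation flags or a different objdump binary without subclassing the compile logic.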

tinygrad/runtime/ops_cloud.py (new file)
@@ -0,0 +1,220 @@
+# the CLOUD=1 device is a process boundary between the frontend/runtime
+# normally tinygrad is frontend <-> middleware <-> runtime <-> hardware
+# with CLOUD tinygrad is frontend <-> middleware <-> CloudDevice ///HTTP/// cloud_server <-> runtime <-> hardware
+# this client and server can be on the same machine, same network, or just same internet
+# it should be a secure (example: no use of pickle) boundary. HTTP is used for RPC
+
+from __future__ import annotations
+from typing import Tuple, Optional, Dict, Any, DefaultDict, List
+from collections import defaultdict
+from dataclasses import dataclass, field
+import multiprocessing, functools, http.client, hashlib, json, time, os, binascii, struct, ast, contextlib
+from http.server import HTTPServer, BaseHTTPRequestHandler
+from tinygrad.renderer import Renderer
+from tinygrad.dtype import dtypes
+from tinygrad.helpers import getenv, DEBUG, fromimport, unwrap, Timing
+from tinygrad.device import Compiled, Allocator, Compiler, Device, BufferOptions
+
+# ***** API *****
+
+class CloudRequest: pass
+
+@dataclass(frozen=True)
+class BufferAlloc(CloudRequest): buffer_num: int; size: int; options: BufferOptions # noqa: E702
+
+@dataclass(frozen=True)
+class BufferFree(CloudRequest): buffer_num: int # noqa: E702
+
+@dataclass(frozen=True)
+class CopyIn(CloudRequest): buffer_num: int; datahash: str # noqa: E702
+
+@dataclass(frozen=True)
+class CopyOut(CloudRequest): buffer_num: int
+
+@dataclass(frozen=True)
+class ProgramAlloc(CloudRequest): name: str; datahash: str # noqa: E702
+
+@dataclass(frozen=True)
+class ProgramFree(CloudRequest): name: str; datahash: str # noqa: E702
+
+@dataclass(frozen=True)
+class ProgramExec(CloudRequest):
+  name: str; datahash: str; bufs: Tuple[int, ...]; vals: Tuple[int, ...] # noqa: E702
+  global_size: Optional[Tuple[int, ...]]; local_size: Optional[Tuple[int, ...]]; wait: bool # noqa: E702
+
+# for safe deserialization
+whitelist = {x.__name__:x for x in [BufferAlloc, BufferFree, CopyIn, CopyOut, ProgramAlloc, ProgramFree, ProgramExec, BufferOptions]}
+eval_fxns = {ast.Constant: lambda x: x.value, ast.Tuple: lambda x: tuple(map(safe_eval, x.elts)), ast.List: lambda x: list(map(safe_eval, x.elts)),
+  ast.Call: lambda x: safe_eval(x.func)(*[safe_eval(arg) for arg in x.args], **{kwarg.arg: safe_eval(kwarg.value) for kwarg in x.keywords}),
+  ast.Name: lambda x: whitelist[x.id], ast.Attribute: lambda x: {"imagef": dtypes.imagef, "imageh": dtypes.imageh}[x.attr]}
+def safe_eval(node): return eval_fxns[node.__class__](node)
+
+class BatchRequest:
+  def __init__(self):
+    self._q: List[CloudRequest] = []
+    self._h: Dict[str, bytes] = {}
+  def h(self, d:bytes) -> str:
+    binhash = hashlib.sha256(d).digest()
+    self._h[datahash:=binascii.hexlify(binhash).decode()] = binhash+struct.pack("<Q", len(d))+d
+    return datahash
+  def q(self, x:CloudRequest): self._q.append(x)
+  def serialize(self) -> bytes:
+    self.h(repr(self._q).encode())
+    return b''.join(self._h.values())
+  def deserialize(self, dat:bytes) -> BatchRequest:
+    ptr = 0
+    while ptr < len(dat):
+      datahash, datalen = binascii.hexlify(dat[ptr:ptr+0x20]).decode(), struct.unpack("<Q", dat[ptr+0x20:ptr+0x28])[0]
+      self._h[datahash] = dat[ptr+0x28:ptr+0x28+datalen]
+      ptr += 0x28+datalen
+    self._q = safe_eval(ast.parse(self._h[datahash], mode="eval").body)
+    return self
+
+# ***** backend *****
+
+@dataclass
+class CloudSession:
+  programs: Dict[Tuple[str, str], Any] = field(default_factory=dict)
+  # TODO: the buffer should track this internally
+  buffers: Dict[int, Tuple[Any, int, Optional[BufferOptions]]] = field(default_factory=dict)
+
+class CloudHandler(BaseHTTPRequestHandler):
+  protocol_version = 'HTTP/1.1'
+  dname: str
+  sessions: DefaultDict[str, CloudSession] = defaultdict(CloudSession)
+
+  def setup(self):
+    super().setup()
+    print(f"connection established with {self.client_address}, socket: {self.connection.fileno()}")
+
+  def _do(self, method):
+    session = CloudHandler.sessions[unwrap(self.headers.get("Cookie")).split("session=")[1]]
+    ret, status_code = b"", 200
+    if self.path == "/batch" and method == "POST":
+      # TODO: streaming deserialize?
+      req = BatchRequest().deserialize(self.rfile.read(int(unwrap(self.headers.get('Content-Length')))))
+      # the cmds are always last (currently in datahash)
+      for c in req._q:
+        if DEBUG >= 1: print(c)
+        match c:
+          case BufferAlloc():
+            assert c.buffer_num not in session.buffers, f"buffer {c.buffer_num} already allocated"
+            session.buffers[c.buffer_num] = (Device[CloudHandler.dname].allocator.alloc(c.size, c.options), c.size, c.options)
+          case BufferFree():
+            buf,sz,buffer_options = session.buffers[c.buffer_num]
+            Device[CloudHandler.dname].allocator.free(buf,sz,buffer_options)
+            del session.buffers[c.buffer_num]
+          case CopyIn(): Device[CloudHandler.dname].allocator.copyin(session.buffers[c.buffer_num][0], memoryview(bytearray(req._h[c.datahash])))
+          case CopyOut():
+            buf,sz,_ = session.buffers[c.buffer_num]
+            Device[CloudHandler.dname].allocator.copyout(memoryview(ret:=bytearray(sz)), buf)
+          case ProgramAlloc():
+            lib = Device[CloudHandler.dname].compiler.compile_cached(req._h[c.datahash].decode())
+            session.programs[(c.name, c.datahash)] = Device[CloudHandler.dname].runtime(c.name, lib)
+          case ProgramFree(): del session.programs[(c.name, c.datahash)]
+          case ProgramExec():
+            bufs = [session.buffers[x][0] for x in c.bufs]
+            extra_args = {k:v for k,v in [("global_size", c.global_size), ("local_size", c.local_size)] if v is not None}
+            r = session.programs[(c.name, c.datahash)](*bufs, vals=c.vals, wait=c.wait, **extra_args)
+            if r is not None: ret = str(r).encode()
+    elif self.path == "/renderer" and method == "GET":
+      cls, args = Device[CloudHandler.dname].renderer.__reduce__()
+      ret = json.dumps((cls.__module__, cls.__name__, args)).encode()
+    else: status_code = 404
+    self.send_response(status_code)
+    self.send_header('Content-Length', str(len(ret)))
+    self.end_headers()
+    return self.wfile.write(ret)
+
+  def do_GET(self): return self._do("GET")
+  def do_POST(self): return self._do("POST")
+
+def cloud_server(port:int):
+  multiprocessing.current_process().name = "MainProcess"
+  CloudHandler.dname = getenv("CLOUDDEV", "METAL") if Device.DEFAULT == "CLOUD" else Device.DEFAULT
+  print(f"start cloud server on {port} with device {CloudHandler.dname}")
+  server = HTTPServer(('', port), CloudHandler)
+  server.serve_forever()
+
+# ***** frontend *****
+
+class CloudAllocator(Allocator):
+  def __init__(self, device:CloudDevice):
+    self.device = device
+    super().__init__()
+  # TODO: ideally we shouldn't have to deal with images here
+  def _alloc(self, size:int, options:BufferOptions) -> int:
+    self.device.buffer_num += 1
+    self.device.req.q(BufferAlloc(self.device.buffer_num, size, options))
+    return self.device.buffer_num
+  # TODO: options should not be here in any Allocator
+  def _free(self, opaque:int, options): self.device.req.q(BufferFree(opaque))
+  def copyin(self, dest:int, src:memoryview): self.device.req.q(CopyIn(dest, self.device.req.h(bytes(src))))
+  def copyout(self, dest:memoryview, src:int):
+    self.device.req.q(CopyOut(src))
+    resp = self.device.batch_submit()
+    assert len(resp) == len(dest), f"buffer length mismatch {len(resp)} != {len(dest)}"
+    dest[:] = resp
+
+class CloudProgram:
+  def __init__(self, device:CloudDevice, name:str, lib:bytes):
+    self.device, self.name = device, name
+    self.datahash = self.device.req.h(lib)
+    self.device.req.q(ProgramAlloc(self.name, self.datahash))
+    super().__init__()
+  def __del__(self): self.device.req.q(ProgramFree(self.name, self.datahash))
+
+  def __call__(self, *bufs, global_size=None, local_size=None, vals:Tuple[int, ...]=(), wait=False):
+    self.device.req.q(ProgramExec(self.name, self.datahash, bufs, vals, global_size, local_size, wait))
+    if wait: return float(self.device.batch_submit())
+
+class CloudDevice(Compiled):
+  def __init__(self, device:str):
+    if (host:=getenv("HOST", "")) != "": self.host = host
+    else:
+      p = multiprocessing.Process(target=cloud_server, args=(6667,))
+      p.daemon = True
+      p.start()
+      self.host = "127.0.0.1:6667"
+
+    # state for the connection
+    self.session = binascii.hexlify(os.urandom(0x10)).decode()
+    self.buffer_num = 0
+    self.req: BatchRequest = BatchRequest()
+
+    if DEBUG >= 1: print(f"cloud with host {self.host}")
+    while 1:
+      try:
+        self.conn = http.client.HTTPConnection(self.host, timeout=60.0)
+        clouddev = json.loads(self.send("GET", "renderer").decode())
+        break
+      except Exception as e:
+        print(e)
+        time.sleep(0.1)
+    if DEBUG >= 1: print(f"remote has device {clouddev}")
+    # TODO: how to we have BEAM be cached on the backend? this should just send a specification of the compute. rethink what goes in Renderer
+    if not clouddev[0].startswith("tinygrad.renderer.") or not clouddev[1].endswith("Renderer"): raise RuntimeError(f"bad renderer {clouddev}")
+    renderer_class = fromimport(clouddev[0], clouddev[1]) # TODO: is this secure?
+    if not issubclass(renderer_class, Renderer): raise RuntimeError(f"renderer isn't a Renderer {clouddev}")
+    super().__init__(device, CloudAllocator(self), renderer_class(*clouddev[2]), Compiler(), functools.partial(CloudProgram, self))
+
+  def __del__(self):
+    # TODO: this is never being called
+    # TODO: should close the whole session
+    with contextlib.suppress(ConnectionRefusedError, http.client.CannotSendRequest, http.client.RemoteDisconnected): self.batch_submit()
+
+  def batch_submit(self):
+    data = self.req.serialize()
+    with Timing(f"*** send {len(self.req._q):-3d} requests {len(self.req._h):-3d} hashes with len {len(data)/1024:.2f} kB in ", enabled=DEBUG>=1):
+      ret = self.send("POST", "batch", data)
+    self.req = BatchRequest()
+    return ret
+
+  def send(self, method, path, data:Optional[bytes]=None) -> bytes:
+    # TODO: retry logic
+    self.conn.request(method, "/"+path, data, headers={"Cookie": f"session={self.session}"})
+    response = self.conn.getresponse()
+    assert response.status == 200, f"failed on {method} {path}"
+    return response.read()
+
+if __name__ == "__main__": cloud_server(getenv("PORT", 6667))
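Because the CLOUD boundary deliberately avoids pickle, everything on the wire is the `repr` of the request queue plus raw hash-addressed payloads, each framed as a `sha256 digest, little-endian u64 length, data` record and rebuilt with `safe_eval` against the class whitelist. A small round-trip sketch (runs locally with no server or GPU; assumes `BufferOptions()` is constructible with its defaults):

```python
# sketch: serialize a batch the way CloudDevice.batch_submit would, then
# deserialize it the way the server's /batch handler does
from tinygrad.runtime.ops_cloud import BatchRequest, BufferAlloc, CopyIn
from tinygrad.device import BufferOptions

req = BatchRequest()
req.q(BufferAlloc(buffer_num=1, size=16, options=BufferOptions()))
req.q(CopyIn(buffer_num=1, datahash=req.h(b"\x00" * 16)))  # raw bytes travel by hash
wire = req.serialize()                                      # one POST /batch body

back = BatchRequest().deserialize(wire)
print(back._q)       # [BufferAlloc(...), CopyIn(...)] rebuilt via the AST whitelist
print(len(back._h))  # 2 records: the CopyIn payload and the repr'd command queue
```

With DEBUG=1, `batch_submit` logs the same request and hash counts for a live connection.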

tinygrad/runtime/ops_cuda.py
@@ -1,30 +1,14 @@
 from __future__ import annotations
-import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
-from pathlib import Path
+import ctypes, ctypes.util, functools
 from typing import Tuple, Optional, List
-import tinygrad.runtime.autogen.cuda as cuda
-from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
+from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
+from tinygrad.device import Compiled, BufferOptions, LRUAllocator
 from tinygrad.renderer.cstyle import CUDARenderer
-from tinygrad.renderer.assembly import PTXRenderer
+from tinygrad.renderer.ptx import PTXRenderer
+from tinygrad.runtime.autogen import cuda
+from tinygrad.runtime.support.compiler_cuda import cuda_disassemble, pretty_ptx, CUDACompiler, PTXCompiler, PTX
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
 
-def pretty_ptx(s):
-  # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
-  s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers # noqa: E501
-  s = re.sub(r'(.)((?:b|s|u|f)(?:8|16|32|64)|pred)([\.\s])', lambda m:m[1]+colored(m[2], "green")+m[3], s, flags=re.M) # types
-  s = re.sub(r'^(\s*)([\w]+)(.*?;$)', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # instructions
-  s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers # noqa: E501
-  s = re.sub(r'(\.)(param|reg|global)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # space
-  s = re.sub(r'(\.)(version|target|address_size|visible|entry)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # derivatives
-  return s
-
-CUDACPU = getenv("CUDACPU") == 1
-if CUDACPU:
-  gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
-  gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] # noqa: E501
-  cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # type: ignore # noqa: E501
-
 def check(status):
   if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
 
@@ -36,7 +20,6 @@ def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
   return c_args, vargs
 
 def cu_time_execution(cb, enable=False) -> Optional[float]:
-  if CUDACPU: return cpu_time_execution(cb, enable=enable)
   if not enable: return cb()
   evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
   cuda.cuEventRecord(evs[0], None)
@@ -47,70 +30,34 @@ def cu_time_execution(cb, enable=False) -> Optional[float]:
   for ev in evs: cuda.cuEventDestroy_v2(ev)
   return ret.value * 1e-3
 
-def _get_bytes(arg, get_str, get_sz, check) -> bytes:
-  sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
-  return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
-
-class PTXCompiler(Compiler):
-  def __init__(self, arch:str):
-    self.arch = arch
-    self.version = "7.8" if arch >= "sm_89" else "7.5"
-    super().__init__(f"compile_ptx_{self.arch}")
-  def compile(self, src:str) -> bytes: return src.replace("TARGET", self.arch).replace("VERSION", self.version).encode()
-
-class CUDACompiler(Compiler):
-  def __init__(self, arch:str):
-    self.arch = arch
-    check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
-    self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
-    if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
-    super().__init__(f"compile_cuda_{self.arch}")
-  def compile(self, src:str) -> bytes:
-    check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
-    status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
-
-    if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
-    return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
-
-def cuda_disassemble(lib, arch):
-  try:
-    fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
-    with open(fn + ".ptx", "wb") as f: f.write(lib)
-    subprocess.run(["ptxas", f"-arch={arch}", "-o", fn, fn+".ptx"], check=True)
-    print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
-  except Exception as e: print("failed to generate SASS", str(e))
-
 class CUDAProgram:
-  def __init__(self, device:CUDADevice, name:str, lib:bytes):
-    self.device, self.name, self.lib = device, name, lib
+  def __init__(self, device:CUDADevice, name:str, lib:bytes, smem:int=0):
+    self.device, self.name, self.lib, self.smem = device, name, lib, smem
     if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
     if DEBUG >= 6: cuda_disassemble(lib, device.arch)
 
-    if CUDACPU: self.prg = lib
-    else:
-      check(cuda.cuCtxSetCurrent(self.device.context))
-      self.module = cuda.CUmodule()
-      status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
-      if status != 0:
-        del self.module
-        cuda_disassemble(lib, device.arch)
-        raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
-      check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
-      self.prg = prg #type: ignore
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    self.module = cuda.CUmodule()
+    status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
+    if status != 0:
+      del self.module
+      cuda_disassemble(lib, device.arch)
+      raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
+    check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
+    self.prg = prg
+    if self.smem > 0: check(cuda.cuFuncSetAttribute(self.prg, cuda.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, self.smem))
 
   def __del__(self):
     if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
 
   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    if CUDACPU: self.vargs = args+tuple(vals)
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    if not hasattr(self, "vargs"):
+      self.c_args, self.vargs = encode_args(args, vals)
     else:
-      check(cuda.cuCtxSetCurrent(self.device.context))
-      if not hasattr(self, "vargs"):
-        self.c_args, self.vargs = encode_args(args, vals) #type: ignore
-      else:
-        for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
-        for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
-    return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs)), enable=wait)
+      for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+      for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+    return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, self.smem, None, None, self.vargs)), enable=wait)
 
 class CUDAAllocator(LRUAllocator):
   def __init__(self, device:CUDADevice):
@@ -140,7 +87,7 @@ class CUDAAllocator(LRUAllocator):
     check(cuda.cuEventRecord(sync_event, None))
     check(cuda.cuCtxSetCurrent(dest_dev.context))
     check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev
-  def offset(self, buf, size:int, offset:int): return ctypes.c_ulong(buf.value + offset)
+  def offset(self, buf, size:int, offset:int): return cuda.CUdeviceptr_v2(buf.value + offset)
 
 class CUDADevice(Compiled):
   devices: List[CUDADevice] = []
@@ -148,33 +95,29 @@ class CUDADevice(Compiled):
 
   def __init__(self, device:str):
    device_id = int(device.split(":")[1]) if ":" in device else 0
-    if not CUDACPU:
-      check(cuda.cuInit(0))
-      self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
-      self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
-      check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
-
-      for dev in CUDADevice.devices:
-        check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
-        if val.value != 1: continue
-        check(cuda.cuCtxSetCurrent(dev.context))
-        check(cuda.cuCtxEnablePeerAccess(self.context, 0))
-        check(cuda.cuCtxSetCurrent(self.context))
-        check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
-        CUDADevice.peer_access = True
-
-    self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
+    check(cuda.cuInit(0))
+    self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
+    self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
+    check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
+
+    for dev in CUDADevice.devices:
+      check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
+      if val.value != 1: continue
+      check(cuda.cuCtxSetCurrent(dev.context))
+      check(cuda.cuCtxEnablePeerAccess(self.context, 0))
+      check(cuda.cuCtxSetCurrent(self.context))
+      check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
+      CUDADevice.peer_access = True
+
+    self.arch = f"sm_{major.value}{minor.value}"
     self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
    CUDADevice.devices.append(self)
 
     from tinygrad.runtime.graph.cuda import CUDAGraph
-    super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
-                     PTXRenderer(self.arch) if getenv("PTX") else CUDARenderer(self.arch),
-                     PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
-                     functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
+    super().__init__(device, CUDAAllocator(self), PTXRenderer(self.arch) if PTX else CUDARenderer(self.arch),
+                     PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self), graph=CUDAGraph)
 
   def synchronize(self):
-    if CUDACPU: return
     check(cuda.cuCtxSetCurrent(self.context))
     check(cuda.cuCtxSynchronize())
     for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
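The `PTX` switch is now a module-level flag imported from `tinygrad.runtime.support.compiler_cuda` instead of a `getenv("PTX")` call at device construction, and `CUDAProgram` threads a `smem` argument through to `cuFuncSetAttribute`/`cuLaunchKernel`. A sketch of checking which renderer/compiler pair a `CUDADevice` was built with; it requires a working CUDA install and assumes the flag is still populated from the `PTX` environment variable at import time:

```python
# sketch: inspect the backend pair CUDADevice picked after the refactor
import os
os.environ["PTX"] = "1"          # must be set before tinygrad is imported

from tinygrad import Tensor, Device

dev = Device["CUDA"]             # PTXRenderer/PTXCompiler when PTX=1, else CUDARenderer/CUDACompiler
print(type(dev.renderer).__name__, type(dev.compiler).__name__)

# a tiny launch goes through CUDAProgram.__call__, which now passes self.smem
# as the dynamic shared memory argument of cuLaunchKernel
print((Tensor.ones(4, device="CUDA") + 1).tolist())
```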

tinygrad/runtime/ops_disk.py
@@ -1,13 +1,11 @@
 from __future__ import annotations
-import os, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
+import os, sys, mmap, io, ctypes, ctypes.util, contextlib
 from typing import Optional, Generator, Tuple, Callable, List
 from tinygrad.helpers import OSX, round_up
 from tinygrad.device import Compiled, Allocator
-import tinygrad.runtime.autogen.io_uring as io_uring
-
-libc = ctypes.CDLL(ctypes.util.find_library("c"))
-libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
-libc.mmap.restype = ctypes.c_void_p
+with contextlib.suppress(ImportError):
+  import _posixshmem
+from tinygrad.runtime.autogen import io_uring, libc
 
 class DiskBuffer:
   def __init__(self, device:DiskDevice, size:int, offset=0):
@@ -27,7 +25,7 @@ class DiskAllocator(Allocator):
   def as_buffer(self, src:DiskBuffer): return src._buf()
   def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
   def copyout(self, dest:memoryview, src:DiskBuffer):
-    if OSX and hasattr(self.device, 'fd'):
+    if OSX and self.device.fd is not None:
       # OSX doesn't seem great at mmap, this is faster
       with io.FileIO(self.device.fd, "a+b", closefd=False) as fo:
         fo.seek(src.offset)
@@ -76,6 +74,7 @@ class DiskDevice(Compiled):
     if not DiskDevice._tried_io_uring_init: self._iouring_setup()
 
     self.size: Optional[int] = None
+    self.fd: Optional[int] = None
     self.count = 0
     super().__init__(device, DiskAllocator(self), None, None, None)
   def _might_open(self, size):
@@ -85,41 +84,41 @@ class DiskDevice(Compiled):
     filename = self.dname[len("disk:"):]
     self.size = size
 
-    if filename.startswith("shm:"):
+    if sys.platform != "win32" and filename.startswith("shm:"):
       fd = _posixshmem.shm_open("/"+filename[4:].lstrip("/"), os.O_RDWR, 0o600)
       self.mem = mmap.mmap(fd, self.size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)
       os.close(fd)
     else:
-      try: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT|(0 if OSX else os.O_DIRECT))
+      try: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT|getattr(os, "O_DIRECT", 0))
      except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
      if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
      self.mem = mmap.mmap(self.fd, self.size)
-    if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
+    if hasattr(self.mem, 'madvise') and (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
       with contextlib.suppress(OSError): self.mem.madvise(hp) # some systems have transparent_hugepage disabled
   def _might_close(self):
     self.count -= 1
     if self.count == 0:
-      if hasattr(self, 'fd'): os.close(self.fd)
+      if self.fd is not None: os.close(self.fd)
       self.size = None
   def _iouring_setup(self):
     DiskDevice._tried_io_uring_init = True
 
-    if platform.system() != 'Linux': return
-
-    fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
-    if fd < 0: return
+    if sys.platform == 'linux' and not hasattr(sys, "getandroidapilevel"):
+      fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
+      if fd < 0: return
 
-    sq_ptr = libc.mmap(0, p.sq_off.array + p.sq_entries * 4, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, 0)
-    cq_ptr = libc.mmap(0, p.cq_off.cqes + p.cq_entries * ctypes.sizeof(io_uring.struct_io_uring_cqe),
-                       mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_CQ_RING)
-    sqes = libc.mmap(0, p.sq_entries * ctypes.sizeof(io_uring.struct_io_uring_sqe),
-                     mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_SQES)
+      sq_ptr = libc.mmap(0, p.sq_off.array + p.sq_entries * 4, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, 0)
+      cq_ptr = libc.mmap(0, p.cq_off.cqes + p.cq_entries * ctypes.sizeof(io_uring.struct_io_uring_cqe),
+                         mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_CQ_RING)
+      sqes = libc.mmap(0, p.sq_entries * ctypes.sizeof(io_uring.struct_io_uring_sqe),
+                       mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_SQES)
 
-    def u32ptr(val): return ctypes.cast(val, ctypes.POINTER(ctypes.c_uint32))
-    sqdesc = io_uring.struct_io_uring_sq(khead=u32ptr(sq_ptr+p.sq_off.head), ktail=u32ptr(sq_ptr+p.sq_off.tail), array=u32ptr(sq_ptr+p.sq_off.array),
-                                         kring_mask=u32ptr(sq_ptr+p.sq_off.ring_mask), sqes=ctypes.cast(sqes, ctypes.POINTER(io_uring.struct_io_uring_sqe)))
+      def u32ptr(val): return ctypes.cast(val, ctypes.POINTER(ctypes.c_uint32))
+      sqdesc = io_uring.struct_io_uring_sq(khead=u32ptr(sq_ptr+p.sq_off.head), ktail=u32ptr(sq_ptr+p.sq_off.tail),
+                                           array=u32ptr(sq_ptr+p.sq_off.array),
+                                           kring_mask=u32ptr(sq_ptr+p.sq_off.ring_mask), sqes=ctypes.cast(sqes, ctypes.POINTER(io_uring.struct_io_uring_sqe)))
 
-    cqdesc = io_uring.struct_io_uring_cq(khead=u32ptr(cq_ptr+p.cq_off.head), ktail=u32ptr(cq_ptr+p.cq_off.tail),
-                                         kring_mask=u32ptr(sq_ptr+p.cq_off.ring_mask), cqes=ctypes.cast(cq_ptr+p.cq_off.cqes, ctypes.POINTER(io_uring.struct_io_uring_cqe)))
+      cqdesc = io_uring.struct_io_uring_cq(khead=u32ptr(cq_ptr+p.cq_off.head), ktail=u32ptr(cq_ptr+p.cq_off.tail),
+                                           kring_mask=u32ptr(sq_ptr+p.cq_off.ring_mask), cqes=ctypes.cast(cq_ptr+p.cq_off.cqes, ctypes.POINTER(io_uring.struct_io_uring_cqe)))
 
-    DiskDevice.io_uring = io_uring.struct_io_uring(ring_fd=fd, sq=sqdesc, cq=cqdesc) # type: ignore
+      DiskDevice.io_uring = io_uring.struct_io_uring(ring_fd=fd, sq=sqdesc, cq=cqdesc) # type: ignore
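The simplest way to exercise the updated disk runtime (the new `self.fd` bookkeeping, the `getattr(os, "O_DIRECT", 0)` fallback, and the Linux-only io_uring setup) is through safetensors I/O, which reads and writes via `disk:` devices. A sketch, assuming a POSIX `/tmp` and whatever `Device.DEFAULT` is available locally:

```python
# sketch: round-trip a tensor through the DISK device via safetensors
from tinygrad import Tensor, Device
from tinygrad.nn.state import safe_save, safe_load

safe_save({"w": Tensor.arange(4).float()}, "/tmp/tg_disk_demo.safetensors")   # writes through DiskAllocator.copyin
loaded = safe_load("/tmp/tg_disk_demo.safetensors")["w"].to(Device.DEFAULT)   # copies back off the disk device
print(loaded.tolist())   # expected: [0.0, 1.0, 2.0, 3.0]
```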