tinygrad-0.10.0-py3-none-any.whl → tinygrad-0.10.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88):
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +141 -201
  4. tinygrad/codegen/linearize.py +223 -84
  5. tinygrad/codegen/lowerer.py +60 -42
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +22 -13
  8. tinygrad/device.py +187 -47
  9. tinygrad/dtype.py +39 -28
  10. tinygrad/engine/jit.py +83 -65
  11. tinygrad/engine/memory.py +4 -5
  12. tinygrad/engine/multi.py +161 -0
  13. tinygrad/engine/realize.py +62 -108
  14. tinygrad/engine/schedule.py +396 -357
  15. tinygrad/engine/search.py +55 -66
  16. tinygrad/gradient.py +73 -0
  17. tinygrad/helpers.py +81 -59
  18. tinygrad/nn/__init__.py +30 -32
  19. tinygrad/nn/datasets.py +1 -2
  20. tinygrad/nn/optim.py +22 -26
  21. tinygrad/nn/state.py +91 -66
  22. tinygrad/ops.py +492 -641
  23. tinygrad/renderer/__init__.py +95 -36
  24. tinygrad/renderer/cstyle.py +99 -92
  25. tinygrad/renderer/llvmir.py +83 -34
  26. tinygrad/renderer/ptx.py +83 -99
  27. tinygrad/renderer/wgsl.py +95 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  29. tinygrad/runtime/autogen/comgr.py +2 -0
  30. tinygrad/runtime/autogen/kfd.py +4 -3
  31. tinygrad/runtime/autogen/kgsl.py +1 -1
  32. tinygrad/runtime/autogen/libc.py +404 -71
  33. tinygrad/runtime/autogen/llvm.py +11379 -0
  34. tinygrad/runtime/autogen/pci.py +1333 -0
  35. tinygrad/runtime/autogen/vfio.py +891 -0
  36. tinygrad/runtime/autogen/webgpu.py +6985 -0
  37. tinygrad/runtime/graph/cuda.py +8 -9
  38. tinygrad/runtime/graph/hcq.py +84 -79
  39. tinygrad/runtime/graph/metal.py +40 -43
  40. tinygrad/runtime/ops_amd.py +498 -334
  41. tinygrad/runtime/ops_cloud.py +34 -34
  42. tinygrad/runtime/ops_cpu.py +24 -0
  43. tinygrad/runtime/ops_cuda.py +30 -27
  44. tinygrad/runtime/ops_disk.py +62 -63
  45. tinygrad/runtime/ops_dsp.py +159 -42
  46. tinygrad/runtime/ops_gpu.py +30 -30
  47. tinygrad/runtime/ops_hip.py +29 -31
  48. tinygrad/runtime/ops_llvm.py +48 -41
  49. tinygrad/runtime/ops_metal.py +149 -113
  50. tinygrad/runtime/ops_npy.py +2 -2
  51. tinygrad/runtime/ops_nv.py +238 -273
  52. tinygrad/runtime/ops_python.py +55 -50
  53. tinygrad/runtime/ops_qcom.py +129 -157
  54. tinygrad/runtime/ops_webgpu.py +225 -0
  55. tinygrad/runtime/support/allocator.py +94 -0
  56. tinygrad/runtime/support/am/__init__.py +0 -0
  57. tinygrad/runtime/support/am/amdev.py +396 -0
  58. tinygrad/runtime/support/am/ip.py +463 -0
  59. tinygrad/runtime/support/compiler_cuda.py +4 -2
  60. tinygrad/runtime/support/elf.py +28 -4
  61. tinygrad/runtime/support/hcq.py +256 -324
  62. tinygrad/runtime/support/llvm.py +26 -0
  63. tinygrad/shape/shapetracker.py +85 -53
  64. tinygrad/shape/view.py +104 -140
  65. tinygrad/spec.py +155 -0
  66. tinygrad/tensor.py +835 -527
  67. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  68. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  69. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  70. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  71. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  72. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  73. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  74. tinygrad/viz/index.html +544 -0
  75. tinygrad/viz/perfetto.html +178 -0
  76. tinygrad/viz/serve.py +205 -0
  77. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
  78. tinygrad-0.10.2.dist-info/RECORD +99 -0
  79. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
  80. tinygrad/codegen/uopgraph.py +0 -506
  81. tinygrad/engine/lazy.py +0 -228
  82. tinygrad/function.py +0 -212
  83. tinygrad/multi.py +0 -177
  84. tinygrad/runtime/graph/clang.py +0 -39
  85. tinygrad/runtime/ops_clang.py +0 -35
  86. tinygrad-0.10.0.dist-info/RECORD +0 -77
  87. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  88. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,7 @@
5
5
  # it should be a secure (example: no use of pickle) boundary. HTTP is used for RPC
6
6
 
7
7
  from __future__ import annotations
8
- from typing import Tuple, Optional, Dict, Any, DefaultDict, List
8
+ from typing import Optional, Any
9
9
  from collections import defaultdict
10
10
  from dataclasses import dataclass, field
11
11
  import multiprocessing, functools, http.client, hashlib, json, time, os, binascii, struct, ast, contextlib
@@ -13,14 +13,14 @@ from http.server import HTTPServer, BaseHTTPRequestHandler
13
13
  from tinygrad.renderer import Renderer
14
14
  from tinygrad.dtype import dtypes
15
15
  from tinygrad.helpers import getenv, DEBUG, fromimport, unwrap, Timing
16
- from tinygrad.device import Compiled, Allocator, Compiler, Device, BufferOptions
16
+ from tinygrad.device import Compiled, Allocator, Compiler, Device, BufferSpec
17
17
 
18
18
  # ***** API *****
19
19
 
20
20
  class CloudRequest: pass
21
21
 
22
22
  @dataclass(frozen=True)
23
- class BufferAlloc(CloudRequest): buffer_num: int; size: int; options: BufferOptions # noqa: E702
23
+ class BufferAlloc(CloudRequest): buffer_num: int; size: int; options: BufferSpec # noqa: E702
24
24
 
25
25
  @dataclass(frozen=True)
26
26
  class BufferFree(CloudRequest): buffer_num: int # noqa: E702
@@ -39,11 +39,11 @@ class ProgramFree(CloudRequest): name: str; datahash: str # noqa: E702
39
39
 
40
40
  @dataclass(frozen=True)
41
41
  class ProgramExec(CloudRequest):
42
- name: str; datahash: str; bufs: Tuple[int, ...]; vals: Tuple[int, ...] # noqa: E702
43
- global_size: Optional[Tuple[int, ...]]; local_size: Optional[Tuple[int, ...]]; wait: bool # noqa: E702
42
+ name: str; datahash: str; bufs: tuple[int, ...]; vals: tuple[int, ...] # noqa: E702
43
+ global_size: Optional[tuple[int, ...]]; local_size: Optional[tuple[int, ...]]; wait: bool # noqa: E702
44
44
 
45
45
  # for safe deserialization
46
- whitelist = {x.__name__:x for x in [BufferAlloc, BufferFree, CopyIn, CopyOut, ProgramAlloc, ProgramFree, ProgramExec, BufferOptions]}
46
+ whitelist = {x.__name__:x for x in [BufferAlloc, BufferFree, CopyIn, CopyOut, ProgramAlloc, ProgramFree, ProgramExec, BufferSpec]}
47
47
  eval_fxns = {ast.Constant: lambda x: x.value, ast.Tuple: lambda x: tuple(map(safe_eval, x.elts)), ast.List: lambda x: list(map(safe_eval, x.elts)),
48
48
  ast.Call: lambda x: safe_eval(x.func)(*[safe_eval(arg) for arg in x.args], **{kwarg.arg: safe_eval(kwarg.value) for kwarg in x.keywords}),
49
49
  ast.Name: lambda x: whitelist[x.id], ast.Attribute: lambda x: {"imagef": dtypes.imagef, "imageh": dtypes.imageh}[x.attr]}
@@ -51,8 +51,8 @@ def safe_eval(node): return eval_fxns[node.__class__](node)
51
51
 
52
52
  class BatchRequest:
53
53
  def __init__(self):
54
- self._q: List[CloudRequest] = []
55
- self._h: Dict[str, bytes] = {}
54
+ self._q: list[CloudRequest] = []
55
+ self._h: dict[str, bytes] = {}
56
56
  def h(self, d:bytes) -> str:
57
57
  binhash = hashlib.sha256(d).digest()
58
58
  self._h[datahash:=binascii.hexlify(binhash).decode()] = binhash+struct.pack("<Q", len(d))+d
@@ -74,14 +74,14 @@ class BatchRequest:
74
74
 
75
75
  @dataclass
76
76
  class CloudSession:
77
- programs: Dict[Tuple[str, str], Any] = field(default_factory=dict)
77
+ programs: dict[tuple[str, str], Any] = field(default_factory=dict)
78
78
  # TODO: the buffer should track this internally
79
- buffers: Dict[int, Tuple[Any, int, Optional[BufferOptions]]] = field(default_factory=dict)
79
+ buffers: dict[int, tuple[Any, int, Optional[BufferSpec]]] = field(default_factory=dict)
80
80
 
81
81
  class CloudHandler(BaseHTTPRequestHandler):
82
82
  protocol_version = 'HTTP/1.1'
83
- dname: str
84
- sessions: DefaultDict[str, CloudSession] = defaultdict(CloudSession)
83
+ device: str
84
+ sessions: defaultdict[str, CloudSession] = defaultdict(CloudSession)
85
85
 
86
86
  def setup(self):
87
87
  super().setup()
@@ -99,18 +99,18 @@ class CloudHandler(BaseHTTPRequestHandler):
99
99
  match c:
100
100
  case BufferAlloc():
101
101
  assert c.buffer_num not in session.buffers, f"buffer {c.buffer_num} already allocated"
102
- session.buffers[c.buffer_num] = (Device[CloudHandler.dname].allocator.alloc(c.size, c.options), c.size, c.options)
102
+ session.buffers[c.buffer_num] = (Device[CloudHandler.device].allocator.alloc(c.size, c.options), c.size, c.options)
103
103
  case BufferFree():
104
104
  buf,sz,buffer_options = session.buffers[c.buffer_num]
105
- Device[CloudHandler.dname].allocator.free(buf,sz,buffer_options)
105
+ Device[CloudHandler.device].allocator.free(buf,sz,buffer_options)
106
106
  del session.buffers[c.buffer_num]
107
- case CopyIn(): Device[CloudHandler.dname].allocator.copyin(session.buffers[c.buffer_num][0], memoryview(bytearray(req._h[c.datahash])))
107
+ case CopyIn(): Device[CloudHandler.device].allocator._copyin(session.buffers[c.buffer_num][0], memoryview(bytearray(req._h[c.datahash])))
108
108
  case CopyOut():
109
109
  buf,sz,_ = session.buffers[c.buffer_num]
110
- Device[CloudHandler.dname].allocator.copyout(memoryview(ret:=bytearray(sz)), buf)
110
+ Device[CloudHandler.device].allocator._copyout(memoryview(ret:=bytearray(sz)), buf)
111
111
  case ProgramAlloc():
112
- lib = Device[CloudHandler.dname].compiler.compile_cached(req._h[c.datahash].decode())
113
- session.programs[(c.name, c.datahash)] = Device[CloudHandler.dname].runtime(c.name, lib)
112
+ lib = Device[CloudHandler.device].compiler.compile_cached(req._h[c.datahash].decode())
113
+ session.programs[(c.name, c.datahash)] = Device[CloudHandler.device].runtime(c.name, lib)
114
114
  case ProgramFree(): del session.programs[(c.name, c.datahash)]
115
115
  case ProgramExec():
116
116
  bufs = [session.buffers[x][0] for x in c.bufs]
@@ -118,7 +118,7 @@ class CloudHandler(BaseHTTPRequestHandler):
118
118
  r = session.programs[(c.name, c.datahash)](*bufs, vals=c.vals, wait=c.wait, **extra_args)
119
119
  if r is not None: ret = str(r).encode()
120
120
  elif self.path == "/renderer" and method == "GET":
121
- cls, args = Device[CloudHandler.dname].renderer.__reduce__()
121
+ cls, args = Device[CloudHandler.device].renderer.__reduce__()
122
122
  ret = json.dumps((cls.__module__, cls.__name__, args)).encode()
123
123
  else: status_code = 404
124
124
  self.send_response(status_code)
@@ -131,42 +131,42 @@ class CloudHandler(BaseHTTPRequestHandler):
131
131
 
132
132
  def cloud_server(port:int):
133
133
  multiprocessing.current_process().name = "MainProcess"
134
- CloudHandler.dname = getenv("CLOUDDEV", "METAL") if Device.DEFAULT == "CLOUD" else Device.DEFAULT
135
- print(f"start cloud server on {port} with device {CloudHandler.dname}")
134
+ CloudHandler.device = getenv("CLOUDDEV", "METAL") if Device.DEFAULT == "CLOUD" else Device.DEFAULT
135
+ print(f"start cloud server on {port} with device {CloudHandler.device}")
136
136
  server = HTTPServer(('', port), CloudHandler)
137
137
  server.serve_forever()
138
138
 
139
139
  # ***** frontend *****
140
140
 
141
141
  class CloudAllocator(Allocator):
142
- def __init__(self, device:CloudDevice):
143
- self.device = device
142
+ def __init__(self, dev:CloudDevice):
143
+ self.device = dev
144
144
  super().__init__()
145
145
  # TODO: ideally we shouldn't have to deal with images here
146
- def _alloc(self, size:int, options:BufferOptions) -> int:
146
+ def _alloc(self, size:int, options:BufferSpec) -> int:
147
147
  self.device.buffer_num += 1
148
148
  self.device.req.q(BufferAlloc(self.device.buffer_num, size, options))
149
149
  return self.device.buffer_num
150
150
  # TODO: options should not be here in any Allocator
151
151
  def _free(self, opaque:int, options): self.device.req.q(BufferFree(opaque))
152
- def copyin(self, dest:int, src:memoryview): self.device.req.q(CopyIn(dest, self.device.req.h(bytes(src))))
153
- def copyout(self, dest:memoryview, src:int):
152
+ def _copyin(self, dest:int, src:memoryview): self.device.req.q(CopyIn(dest, self.device.req.h(bytes(src))))
153
+ def _copyout(self, dest:memoryview, src:int):
154
154
  self.device.req.q(CopyOut(src))
155
155
  resp = self.device.batch_submit()
156
156
  assert len(resp) == len(dest), f"buffer length mismatch {len(resp)} != {len(dest)}"
157
157
  dest[:] = resp
158
158
 
159
159
  class CloudProgram:
160
- def __init__(self, device:CloudDevice, name:str, lib:bytes):
161
- self.device, self.name = device, name
162
- self.datahash = self.device.req.h(lib)
163
- self.device.req.q(ProgramAlloc(self.name, self.datahash))
160
+ def __init__(self, dev:CloudDevice, name:str, lib:bytes):
161
+ self.dev, self.name = dev, name
162
+ self.datahash = self.dev.req.h(lib)
163
+ self.dev.req.q(ProgramAlloc(self.name, self.datahash))
164
164
  super().__init__()
165
- def __del__(self): self.device.req.q(ProgramFree(self.name, self.datahash))
165
+ def __del__(self): self.dev.req.q(ProgramFree(self.name, self.datahash))
166
166
 
167
- def __call__(self, *bufs, global_size=None, local_size=None, vals:Tuple[int, ...]=(), wait=False):
168
- self.device.req.q(ProgramExec(self.name, self.datahash, bufs, vals, global_size, local_size, wait))
169
- if wait: return float(self.device.batch_submit())
167
+ def __call__(self, *bufs, global_size=None, local_size=None, vals:tuple[int, ...]=(), wait=False):
168
+ self.dev.req.q(ProgramExec(self.name, self.datahash, bufs, vals, global_size, local_size, wait))
169
+ if wait: return float(self.dev.batch_submit())
170
170
 
171
171
  class CloudDevice(Compiled):
172
172
  def __init__(self, device:str):
@@ -0,0 +1,24 @@
1
+ import platform, subprocess, sys
2
+ from tinygrad.helpers import capstone_flatdump, getenv
3
+ from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
4
+ from tinygrad.runtime.support.elf import jit_loader
5
+ from tinygrad.renderer.cstyle import ClangRenderer
6
+
7
+ class ClangJITCompiler(Compiler):
8
+ def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
9
+
10
+ def compile(self, src:str) -> bytes:
11
+ # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
12
+ # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
13
+ target = 'x86_64' if sys.platform == 'win32' else platform.machine()
14
+ args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
15
+ arch_args = ['-ffixed-x18'] if target == 'arm64' else []
16
+ obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
17
+ return jit_loader(obj)
18
+
19
+ def disassemble(self, lib:bytes): return capstone_flatdump(lib)
20
+
21
+ class ClangDevice(Compiled):
22
+ def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
23
+
24
+ CPUDevice = ClangDevice
@@ -1,25 +1,25 @@
1
1
  from __future__ import annotations
2
2
  import ctypes, ctypes.util, functools
3
- from typing import Tuple, Optional, List
4
3
  from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
5
- from tinygrad.device import Compiled, BufferOptions, LRUAllocator
4
+ from tinygrad.device import Compiled, BufferSpec, LRUAllocator
6
5
  from tinygrad.renderer.cstyle import CUDARenderer
7
6
  from tinygrad.renderer.ptx import PTXRenderer
8
7
  from tinygrad.runtime.autogen import cuda
9
8
  from tinygrad.runtime.support.compiler_cuda import cuda_disassemble, pretty_ptx, CUDACompiler, PTXCompiler, PTX
10
9
  if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
10
+ if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
11
11
 
12
12
  def check(status):
13
13
  if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
14
14
 
15
- def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
15
+ def encode_args(args, vals) -> tuple[ctypes.Structure, ctypes.Array]:
16
16
  c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] +
17
17
  [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
18
18
  vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2),
19
19
  ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p), ctypes.c_void_p(0))
20
20
  return c_args, vargs
21
21
 
22
- def cu_time_execution(cb, enable=False) -> Optional[float]:
22
+ def cu_time_execution(cb, enable=False) -> float|None:
23
23
  if not enable: return cb()
24
24
  evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
25
25
  cuda.cuEventRecord(evs[0], None)
@@ -31,17 +31,17 @@ def cu_time_execution(cb, enable=False) -> Optional[float]:
31
31
  return ret.value * 1e-3
32
32
 
33
33
  class CUDAProgram:
34
- def __init__(self, device:CUDADevice, name:str, lib:bytes, smem:int=0):
35
- self.device, self.name, self.lib, self.smem = device, name, lib, smem
34
+ def __init__(self, dev:CUDADevice, name:str, lib:bytes, smem:int=0):
35
+ self.dev, self.name, self.lib, self.smem = dev, name, lib, smem
36
36
  if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
37
- if DEBUG >= 6: cuda_disassemble(lib, device.arch)
37
+ if DEBUG >= 6: cuda_disassemble(lib, dev.arch)
38
38
 
39
- check(cuda.cuCtxSetCurrent(self.device.context))
39
+ check(cuda.cuCtxSetCurrent(self.dev.context))
40
40
  self.module = cuda.CUmodule()
41
41
  status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
42
42
  if status != 0:
43
43
  del self.module
44
- cuda_disassemble(lib, device.arch)
44
+ cuda_disassemble(lib, dev.arch)
45
45
  raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
46
46
  check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
47
47
  self.prg = prg
@@ -50,47 +50,50 @@ class CUDAProgram:
50
50
  def __del__(self):
51
51
  if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
52
52
 
53
- def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
54
- check(cuda.cuCtxSetCurrent(self.device.context))
53
+ def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
54
+ check(cuda.cuCtxSetCurrent(self.dev.context))
55
55
  if not hasattr(self, "vargs"):
56
56
  self.c_args, self.vargs = encode_args(args, vals)
57
+
58
+ # HACK: For MOCKGPU send the args struct itself.
59
+ if MOCKGPU: self.vargs = self.c_args # type: ignore[assignment]
57
60
  else:
58
61
  for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
59
62
  for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
60
63
  return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, self.smem, None, None, self.vargs)), enable=wait)
61
64
 
62
65
  class CUDAAllocator(LRUAllocator):
63
- def __init__(self, device:CUDADevice):
64
- self.device = device
66
+ def __init__(self, dev:CUDADevice):
67
+ self.dev = dev
65
68
  super().__init__()
66
- def _alloc(self, size, options:BufferOptions):
67
- check(cuda.cuCtxSetCurrent(self.device.context))
69
+ def _alloc(self, size, options:BufferSpec):
70
+ check(cuda.cuCtxSetCurrent(self.dev.context))
68
71
  if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
69
72
  return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
70
- def _free(self, opaque, options:BufferOptions):
73
+ def _free(self, opaque, options:BufferSpec):
71
74
  if options.host: check(cuda.cuMemFreeHost(opaque))
72
75
  else: check(cuda.cuMemFree_v2(opaque))
73
- def copyin(self, dest, src:memoryview):
74
- check(cuda.cuCtxSetCurrent(self.device.context))
75
- host_mem = self.alloc(len(src), BufferOptions(host=True))
76
- self.device.pending_copyin.append((host_mem, len(src), BufferOptions(host=True)))
76
+ def _copyin(self, dest, src:memoryview):
77
+ check(cuda.cuCtxSetCurrent(self.dev.context))
78
+ host_mem = self.alloc(len(src), BufferSpec(host=True))
79
+ self.dev.pending_copyin.append((host_mem, len(src), BufferSpec(host=True)))
77
80
  ctypes.memmove(host_mem, from_mv(src), len(src))
78
81
  check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
79
- def copyout(self, dest:memoryview, src):
82
+ def _copyout(self, dest:memoryview, src):
80
83
  CUDADevice.synchronize_system()
81
- check(cuda.cuCtxSetCurrent(self.device.context))
84
+ check(cuda.cuCtxSetCurrent(self.dev.context))
82
85
  check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest)))
83
- def transfer(self, dest, src, sz:int, src_dev, dest_dev):
86
+ def _transfer(self, dest, src, sz:int, src_dev, dest_dev):
84
87
  check(cuda.cuCtxSetCurrent(src_dev.context))
85
88
  check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
86
89
  check(cuda.cuMemcpyDtoDAsync_v2(dest, src, sz, None))
87
90
  check(cuda.cuEventRecord(sync_event, None))
88
91
  check(cuda.cuCtxSetCurrent(dest_dev.context))
89
92
  check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev
90
- def offset(self, buf, size:int, offset:int): return cuda.CUdeviceptr_v2(buf.value + offset)
93
+ def _offset(self, buf, size:int, offset:int): return cuda.CUdeviceptr_v2(buf.value + offset)
91
94
 
92
95
  class CUDADevice(Compiled):
93
- devices: List[CUDADevice] = []
96
+ devices: list[CUDADevice] = []
94
97
  peer_access = False
95
98
 
96
99
  def __init__(self, device:str):
@@ -110,12 +113,12 @@ class CUDADevice(Compiled):
110
113
  CUDADevice.peer_access = True
111
114
 
112
115
  self.arch = f"sm_{major.value}{minor.value}"
113
- self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
116
+ self.pending_copyin: list[tuple[int, int, BufferSpec|None]] = []
114
117
  CUDADevice.devices.append(self)
115
118
 
116
119
  from tinygrad.runtime.graph.cuda import CUDAGraph
117
120
  super().__init__(device, CUDAAllocator(self), PTXRenderer(self.arch) if PTX else CUDARenderer(self.arch),
118
- PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self), graph=CUDAGraph)
121
+ PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)
119
122
 
120
123
  def synchronize(self):
121
124
  check(cuda.cuCtxSetCurrent(self.context))
@@ -1,72 +1,11 @@
1
- from __future__ import annotations
2
1
  import os, sys, mmap, io, ctypes, ctypes.util, contextlib
3
- from typing import Optional, Generator, Tuple, Callable, List
2
+ from typing import Optional, Generator, Callable
4
3
  from tinygrad.helpers import OSX, round_up
5
4
  from tinygrad.device import Compiled, Allocator
6
5
  with contextlib.suppress(ImportError):
7
6
  import _posixshmem
8
7
  from tinygrad.runtime.autogen import io_uring, libc
9
8
 
10
- class DiskBuffer:
11
- def __init__(self, device:DiskDevice, size:int, offset=0):
12
- self.device, self.size, self.offset = device, size, offset
13
- def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
14
- def _buf(self) -> memoryview:
15
- assert self.device.mem is not None, "DiskBuffer wasn't opened"
16
- return memoryview(self.device.mem)[self.offset:self.offset+self.size]
17
-
18
- MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
19
- class DiskAllocator(Allocator):
20
- def __init__(self, device:DiskDevice): self.device = device
21
- def _alloc(self, size:int, options):
22
- self.device._might_open(size)
23
- return DiskBuffer(self.device, size)
24
- def _free(self, opaque, options): self.device._might_close()
25
- def as_buffer(self, src:DiskBuffer): return src._buf()
26
- def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
27
- def copyout(self, dest:memoryview, src:DiskBuffer):
28
- if OSX and self.device.fd is not None:
29
- # OSX doesn't seem great at mmap, this is faster
30
- with io.FileIO(self.device.fd, "a+b", closefd=False) as fo:
31
- fo.seek(src.offset)
32
- fo.readinto(dest)
33
- else:
34
- dest[:] = src._buf()
35
-
36
- def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[Tuple[int, int, int, int], None, None]:
37
- assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
38
-
39
- fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
40
- processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
41
- reqs: List[Tuple[int, int, int, int]] = []
42
-
43
- while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
44
- if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
45
- # Prepare sqe
46
- sqe_index = (tail:=DiskDevice.io_uring.sq.ktail[0]) & DiskDevice.io_uring.sq.kring_mask[0]
47
- sqe = DiskDevice.io_uring.sq.sqes[sqe_index]
48
- sqe.opcode, sqe.fd, sqe.off = io_uring.IORING_OP_READ, self.device.fd, fd_offset + next_read_offset
49
- sqe.addr, sqe.len, sqe.user_data = copy_batch[0], min(seg_len, total_copy_size - next_read_offset), len(reqs)
50
-
51
- # Send sqe
52
- DiskDevice.io_uring.sq.array[sqe_index] = sqe_index
53
- DiskDevice.io_uring.sq.ktail[0] = tail + 1
54
- libc.syscall(io_uring.NR_io_uring_enter, DiskDevice.io_uring.ring_fd, 1, 1, io_uring.IORING_ENTER_GETEVENTS)
55
-
56
- reqs.append((copy_batch, copied_in, minor_offset, real_copy_size:=min(sqe.len - minor_offset, size - copied_in)))
57
- next_read_offset += sqe.len
58
- copied_in += real_copy_size
59
- minor_offset = 0
60
-
61
- if (head:=DiskDevice.io_uring.cq.khead[0]) != DiskDevice.io_uring.cq.ktail[0]:
62
- cqe = DiskDevice.io_uring.cq.cqes[head & DiskDevice.io_uring.cq.kring_mask[0]]
63
- assert cqe.res >= 0, f"read from disk failed, err: {cqe.res}"
64
- yield reqs[cqe.user_data]
65
- DiskDevice.io_uring.cq.khead[0] = head + 1 # advance
66
- processed_reqs_cnt += 1
67
-
68
- def offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
69
-
70
9
  class DiskDevice(Compiled):
71
10
  _tried_io_uring_init = False
72
11
 
@@ -81,7 +20,7 @@ class DiskDevice(Compiled):
81
20
  self.count += 1
82
21
  assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
83
22
  if self.size is not None: return
84
- filename = self.dname[len("disk:"):]
23
+ filename = self.device[len("disk:"):]
85
24
  self.size = size
86
25
 
87
26
  if sys.platform != "win32" and filename.startswith("shm:"):
@@ -122,3 +61,63 @@ class DiskDevice(Compiled):
122
61
  kring_mask=u32ptr(sq_ptr+p.cq_off.ring_mask), cqes=ctypes.cast(cq_ptr+p.cq_off.cqes, ctypes.POINTER(io_uring.struct_io_uring_cqe)))
123
62
 
124
63
  DiskDevice.io_uring = io_uring.struct_io_uring(ring_fd=fd, sq=sqdesc, cq=cqdesc) # type: ignore
64
+
65
+ class DiskBuffer:
66
+ def __init__(self, device:DiskDevice, size:int, offset=0):
67
+ self.device, self.size, self.offset = device, size, offset
68
+ def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
69
+ def _buf(self) -> memoryview:
70
+ assert hasattr(self.device, "mem"), f"DiskBuffer wasn't opened: {self.device.device}"
71
+ return memoryview(self.device.mem)[self.offset:self.offset+self.size]
72
+
73
+ MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
74
+ class DiskAllocator(Allocator):
75
+ def __init__(self, dev:DiskDevice): self.dev = dev
76
+ def _alloc(self, size:int, options):
77
+ self.dev._might_open(size)
78
+ return DiskBuffer(self.dev, size)
79
+ def _free(self, opaque, options): self.dev._might_close()
80
+ def _as_buffer(self, src:DiskBuffer): return src._buf()
81
+ def _copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
82
+ def _copyout(self, dest:memoryview, src:DiskBuffer):
83
+ if OSX and self.dev.fd is not None:
84
+ # OSX doesn't seem great at mmap, this is faster
85
+ with io.FileIO(self.dev.fd, "a+b", closefd=False) as fo:
86
+ fo.seek(src.offset)
87
+ fo.readinto(dest)
88
+ else:
89
+ dest[:] = src._buf()
90
+
91
+ def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[tuple[int, int, int, int], None, None]:
92
+ assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
93
+
94
+ fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
95
+ processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
96
+ reqs: list[tuple[int, int, int, int]] = []
97
+
98
+ while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
99
+ if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
100
+ # Prepare sqe
101
+ sqe_index = (tail:=DiskDevice.io_uring.sq.ktail[0]) & DiskDevice.io_uring.sq.kring_mask[0]
102
+ sqe = DiskDevice.io_uring.sq.sqes[sqe_index]
103
+ sqe.opcode, sqe.fd, sqe.off = io_uring.IORING_OP_READ, self.dev.fd, fd_offset + next_read_offset
104
+ sqe.addr, sqe.len, sqe.user_data = copy_batch[0], min(seg_len, total_copy_size - next_read_offset), len(reqs)
105
+
106
+ # Send sqe
107
+ DiskDevice.io_uring.sq.array[sqe_index] = sqe_index
108
+ DiskDevice.io_uring.sq.ktail[0] = tail + 1
109
+ libc.syscall(io_uring.NR_io_uring_enter, DiskDevice.io_uring.ring_fd, 1, 1, io_uring.IORING_ENTER_GETEVENTS)
110
+
111
+ reqs.append((copy_batch, copied_in, minor_offset, real_copy_size:=min(sqe.len - minor_offset, size - copied_in)))
112
+ next_read_offset += sqe.len
113
+ copied_in += real_copy_size
114
+ minor_offset = 0
115
+
116
+ if (head:=DiskDevice.io_uring.cq.khead[0]) != DiskDevice.io_uring.cq.ktail[0]:
117
+ cqe = DiskDevice.io_uring.cq.cqes[head & DiskDevice.io_uring.cq.kring_mask[0]]
118
+ assert cqe.res >= 0, f"read from disk failed, err: {cqe.res}"
119
+ yield reqs[cqe.user_data]
120
+ DiskDevice.io_uring.cq.khead[0] = head + 1 # advance
121
+ processed_reqs_cnt += 1
122
+
123
+ def _offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)