tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/devectorizer.py +247 -0
- tinygrad/codegen/expander.py +121 -0
- tinygrad/codegen/kernel.py +141 -201
- tinygrad/codegen/linearize.py +223 -84
- tinygrad/codegen/lowerer.py +60 -42
- tinygrad/codegen/symbolic.py +476 -0
- tinygrad/codegen/transcendental.py +22 -13
- tinygrad/device.py +187 -47
- tinygrad/dtype.py +39 -28
- tinygrad/engine/jit.py +83 -65
- tinygrad/engine/memory.py +4 -5
- tinygrad/engine/multi.py +161 -0
- tinygrad/engine/realize.py +62 -108
- tinygrad/engine/schedule.py +396 -357
- tinygrad/engine/search.py +55 -66
- tinygrad/gradient.py +73 -0
- tinygrad/helpers.py +81 -59
- tinygrad/nn/__init__.py +30 -32
- tinygrad/nn/datasets.py +1 -2
- tinygrad/nn/optim.py +22 -26
- tinygrad/nn/state.py +91 -66
- tinygrad/ops.py +492 -641
- tinygrad/renderer/__init__.py +95 -36
- tinygrad/renderer/cstyle.py +99 -92
- tinygrad/renderer/llvmir.py +83 -34
- tinygrad/renderer/ptx.py +83 -99
- tinygrad/renderer/wgsl.py +95 -0
- tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- tinygrad/runtime/autogen/comgr.py +2 -0
- tinygrad/runtime/autogen/kfd.py +4 -3
- tinygrad/runtime/autogen/kgsl.py +1 -1
- tinygrad/runtime/autogen/libc.py +404 -71
- tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad/runtime/autogen/pci.py +1333 -0
- tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad/runtime/autogen/webgpu.py +6985 -0
- tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad/runtime/graph/hcq.py +84 -79
- tinygrad/runtime/graph/metal.py +40 -43
- tinygrad/runtime/ops_amd.py +498 -334
- tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad/runtime/ops_cpu.py +24 -0
- tinygrad/runtime/ops_cuda.py +30 -27
- tinygrad/runtime/ops_disk.py +62 -63
- tinygrad/runtime/ops_dsp.py +159 -42
- tinygrad/runtime/ops_gpu.py +30 -30
- tinygrad/runtime/ops_hip.py +29 -31
- tinygrad/runtime/ops_llvm.py +48 -41
- tinygrad/runtime/ops_metal.py +149 -113
- tinygrad/runtime/ops_npy.py +2 -2
- tinygrad/runtime/ops_nv.py +238 -273
- tinygrad/runtime/ops_python.py +55 -50
- tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad/runtime/ops_webgpu.py +225 -0
- tinygrad/runtime/support/allocator.py +94 -0
- tinygrad/runtime/support/am/__init__.py +0 -0
- tinygrad/runtime/support/am/amdev.py +396 -0
- tinygrad/runtime/support/am/ip.py +463 -0
- tinygrad/runtime/support/compiler_cuda.py +4 -2
- tinygrad/runtime/support/elf.py +28 -4
- tinygrad/runtime/support/hcq.py +256 -324
- tinygrad/runtime/support/llvm.py +26 -0
- tinygrad/shape/shapetracker.py +85 -53
- tinygrad/shape/view.py +104 -140
- tinygrad/spec.py +155 -0
- tinygrad/tensor.py +835 -527
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
- tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
- tinygrad/viz/index.html +544 -0
- tinygrad/viz/perfetto.html +178 -0
- tinygrad/viz/serve.py +205 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
- tinygrad-0.10.2.dist-info/RECORD +99 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/uopgraph.py +0 -506
- tinygrad/engine/lazy.py +0 -228
- tinygrad/function.py +0 -212
- tinygrad/multi.py +0 -177
- tinygrad/runtime/graph/clang.py +0 -39
- tinygrad/runtime/ops_clang.py +0 -35
- tinygrad-0.10.0.dist-info/RECORD +0 -77
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_cloud.py
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
# it should be a secure (example: no use of pickle) boundary. HTTP is used for RPC
|
6
6
|
|
7
7
|
from __future__ import annotations
|
8
|
-
from typing import
|
8
|
+
from typing import Optional, Any
|
9
9
|
from collections import defaultdict
|
10
10
|
from dataclasses import dataclass, field
|
11
11
|
import multiprocessing, functools, http.client, hashlib, json, time, os, binascii, struct, ast, contextlib
|
@@ -13,14 +13,14 @@ from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
13
13
|
from tinygrad.renderer import Renderer
|
14
14
|
from tinygrad.dtype import dtypes
|
15
15
|
from tinygrad.helpers import getenv, DEBUG, fromimport, unwrap, Timing
|
16
|
-
from tinygrad.device import Compiled, Allocator, Compiler, Device,
|
16
|
+
from tinygrad.device import Compiled, Allocator, Compiler, Device, BufferSpec
|
17
17
|
|
18
18
|
# ***** API *****
|
19
19
|
|
20
20
|
class CloudRequest: pass
|
21
21
|
|
22
22
|
@dataclass(frozen=True)
|
23
|
-
class BufferAlloc(CloudRequest): buffer_num: int; size: int; options:
|
23
|
+
class BufferAlloc(CloudRequest): buffer_num: int; size: int; options: BufferSpec # noqa: E702
|
24
24
|
|
25
25
|
@dataclass(frozen=True)
|
26
26
|
class BufferFree(CloudRequest): buffer_num: int # noqa: E702
|
@@ -39,11 +39,11 @@ class ProgramFree(CloudRequest): name: str; datahash: str # noqa: E702
|
|
39
39
|
|
40
40
|
@dataclass(frozen=True)
|
41
41
|
class ProgramExec(CloudRequest):
|
42
|
-
name: str; datahash: str; bufs:
|
43
|
-
global_size: Optional[
|
42
|
+
name: str; datahash: str; bufs: tuple[int, ...]; vals: tuple[int, ...] # noqa: E702
|
43
|
+
global_size: Optional[tuple[int, ...]]; local_size: Optional[tuple[int, ...]]; wait: bool # noqa: E702
|
44
44
|
|
45
45
|
# for safe deserialization
|
46
|
-
whitelist = {x.__name__:x for x in [BufferAlloc, BufferFree, CopyIn, CopyOut, ProgramAlloc, ProgramFree, ProgramExec,
|
46
|
+
whitelist = {x.__name__:x for x in [BufferAlloc, BufferFree, CopyIn, CopyOut, ProgramAlloc, ProgramFree, ProgramExec, BufferSpec]}
|
47
47
|
eval_fxns = {ast.Constant: lambda x: x.value, ast.Tuple: lambda x: tuple(map(safe_eval, x.elts)), ast.List: lambda x: list(map(safe_eval, x.elts)),
|
48
48
|
ast.Call: lambda x: safe_eval(x.func)(*[safe_eval(arg) for arg in x.args], **{kwarg.arg: safe_eval(kwarg.value) for kwarg in x.keywords}),
|
49
49
|
ast.Name: lambda x: whitelist[x.id], ast.Attribute: lambda x: {"imagef": dtypes.imagef, "imageh": dtypes.imageh}[x.attr]}
|
@@ -51,8 +51,8 @@ def safe_eval(node): return eval_fxns[node.__class__](node)
|
|
51
51
|
|
52
52
|
class BatchRequest:
|
53
53
|
def __init__(self):
|
54
|
-
self._q:
|
55
|
-
self._h:
|
54
|
+
self._q: list[CloudRequest] = []
|
55
|
+
self._h: dict[str, bytes] = {}
|
56
56
|
def h(self, d:bytes) -> str:
|
57
57
|
binhash = hashlib.sha256(d).digest()
|
58
58
|
self._h[datahash:=binascii.hexlify(binhash).decode()] = binhash+struct.pack("<Q", len(d))+d
|
@@ -74,14 +74,14 @@ class BatchRequest:
|
|
74
74
|
|
75
75
|
@dataclass
|
76
76
|
class CloudSession:
|
77
|
-
programs:
|
77
|
+
programs: dict[tuple[str, str], Any] = field(default_factory=dict)
|
78
78
|
# TODO: the buffer should track this internally
|
79
|
-
buffers:
|
79
|
+
buffers: dict[int, tuple[Any, int, Optional[BufferSpec]]] = field(default_factory=dict)
|
80
80
|
|
81
81
|
class CloudHandler(BaseHTTPRequestHandler):
|
82
82
|
protocol_version = 'HTTP/1.1'
|
83
|
-
|
84
|
-
sessions:
|
83
|
+
device: str
|
84
|
+
sessions: defaultdict[str, CloudSession] = defaultdict(CloudSession)
|
85
85
|
|
86
86
|
def setup(self):
|
87
87
|
super().setup()
|
@@ -99,18 +99,18 @@ class CloudHandler(BaseHTTPRequestHandler):
|
|
99
99
|
match c:
|
100
100
|
case BufferAlloc():
|
101
101
|
assert c.buffer_num not in session.buffers, f"buffer {c.buffer_num} already allocated"
|
102
|
-
session.buffers[c.buffer_num] = (Device[CloudHandler.
|
102
|
+
session.buffers[c.buffer_num] = (Device[CloudHandler.device].allocator.alloc(c.size, c.options), c.size, c.options)
|
103
103
|
case BufferFree():
|
104
104
|
buf,sz,buffer_options = session.buffers[c.buffer_num]
|
105
|
-
Device[CloudHandler.
|
105
|
+
Device[CloudHandler.device].allocator.free(buf,sz,buffer_options)
|
106
106
|
del session.buffers[c.buffer_num]
|
107
|
-
case CopyIn(): Device[CloudHandler.
|
107
|
+
case CopyIn(): Device[CloudHandler.device].allocator._copyin(session.buffers[c.buffer_num][0], memoryview(bytearray(req._h[c.datahash])))
|
108
108
|
case CopyOut():
|
109
109
|
buf,sz,_ = session.buffers[c.buffer_num]
|
110
|
-
Device[CloudHandler.
|
110
|
+
Device[CloudHandler.device].allocator._copyout(memoryview(ret:=bytearray(sz)), buf)
|
111
111
|
case ProgramAlloc():
|
112
|
-
lib = Device[CloudHandler.
|
113
|
-
session.programs[(c.name, c.datahash)] = Device[CloudHandler.
|
112
|
+
lib = Device[CloudHandler.device].compiler.compile_cached(req._h[c.datahash].decode())
|
113
|
+
session.programs[(c.name, c.datahash)] = Device[CloudHandler.device].runtime(c.name, lib)
|
114
114
|
case ProgramFree(): del session.programs[(c.name, c.datahash)]
|
115
115
|
case ProgramExec():
|
116
116
|
bufs = [session.buffers[x][0] for x in c.bufs]
|
@@ -118,7 +118,7 @@ class CloudHandler(BaseHTTPRequestHandler):
|
|
118
118
|
r = session.programs[(c.name, c.datahash)](*bufs, vals=c.vals, wait=c.wait, **extra_args)
|
119
119
|
if r is not None: ret = str(r).encode()
|
120
120
|
elif self.path == "/renderer" and method == "GET":
|
121
|
-
cls, args = Device[CloudHandler.
|
121
|
+
cls, args = Device[CloudHandler.device].renderer.__reduce__()
|
122
122
|
ret = json.dumps((cls.__module__, cls.__name__, args)).encode()
|
123
123
|
else: status_code = 404
|
124
124
|
self.send_response(status_code)
|
@@ -131,42 +131,42 @@ class CloudHandler(BaseHTTPRequestHandler):
|
|
131
131
|
|
132
132
|
def cloud_server(port:int):
|
133
133
|
multiprocessing.current_process().name = "MainProcess"
|
134
|
-
CloudHandler.
|
135
|
-
print(f"start cloud server on {port} with device {CloudHandler.
|
134
|
+
CloudHandler.device = getenv("CLOUDDEV", "METAL") if Device.DEFAULT == "CLOUD" else Device.DEFAULT
|
135
|
+
print(f"start cloud server on {port} with device {CloudHandler.device}")
|
136
136
|
server = HTTPServer(('', port), CloudHandler)
|
137
137
|
server.serve_forever()
|
138
138
|
|
139
139
|
# ***** frontend *****
|
140
140
|
|
141
141
|
class CloudAllocator(Allocator):
|
142
|
-
def __init__(self,
|
143
|
-
self.device =
|
142
|
+
def __init__(self, dev:CloudDevice):
|
143
|
+
self.device = dev
|
144
144
|
super().__init__()
|
145
145
|
# TODO: ideally we shouldn't have to deal with images here
|
146
|
-
def _alloc(self, size:int, options:
|
146
|
+
def _alloc(self, size:int, options:BufferSpec) -> int:
|
147
147
|
self.device.buffer_num += 1
|
148
148
|
self.device.req.q(BufferAlloc(self.device.buffer_num, size, options))
|
149
149
|
return self.device.buffer_num
|
150
150
|
# TODO: options should not be here in any Allocator
|
151
151
|
def _free(self, opaque:int, options): self.device.req.q(BufferFree(opaque))
|
152
|
-
def
|
153
|
-
def
|
152
|
+
def _copyin(self, dest:int, src:memoryview): self.device.req.q(CopyIn(dest, self.device.req.h(bytes(src))))
|
153
|
+
def _copyout(self, dest:memoryview, src:int):
|
154
154
|
self.device.req.q(CopyOut(src))
|
155
155
|
resp = self.device.batch_submit()
|
156
156
|
assert len(resp) == len(dest), f"buffer length mismatch {len(resp)} != {len(dest)}"
|
157
157
|
dest[:] = resp
|
158
158
|
|
159
159
|
class CloudProgram:
|
160
|
-
def __init__(self,
|
161
|
-
self.
|
162
|
-
self.datahash = self.
|
163
|
-
self.
|
160
|
+
def __init__(self, dev:CloudDevice, name:str, lib:bytes):
|
161
|
+
self.dev, self.name = dev, name
|
162
|
+
self.datahash = self.dev.req.h(lib)
|
163
|
+
self.dev.req.q(ProgramAlloc(self.name, self.datahash))
|
164
164
|
super().__init__()
|
165
|
-
def __del__(self): self.
|
165
|
+
def __del__(self): self.dev.req.q(ProgramFree(self.name, self.datahash))
|
166
166
|
|
167
|
-
def __call__(self, *bufs, global_size=None, local_size=None, vals:
|
168
|
-
self.
|
169
|
-
if wait: return float(self.
|
167
|
+
def __call__(self, *bufs, global_size=None, local_size=None, vals:tuple[int, ...]=(), wait=False):
|
168
|
+
self.dev.req.q(ProgramExec(self.name, self.datahash, bufs, vals, global_size, local_size, wait))
|
169
|
+
if wait: return float(self.dev.batch_submit())
|
170
170
|
|
171
171
|
class CloudDevice(Compiled):
|
172
172
|
def __init__(self, device:str):
|
@@ -0,0 +1,24 @@
|
|
1
|
+
import platform, subprocess, sys
|
2
|
+
from tinygrad.helpers import capstone_flatdump, getenv
|
3
|
+
from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
|
4
|
+
from tinygrad.runtime.support.elf import jit_loader
|
5
|
+
from tinygrad.renderer.cstyle import ClangRenderer
|
6
|
+
|
7
|
+
class ClangJITCompiler(Compiler):
|
8
|
+
def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
|
9
|
+
|
10
|
+
def compile(self, src:str) -> bytes:
|
11
|
+
# -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
|
12
|
+
# x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
|
13
|
+
target = 'x86_64' if sys.platform == 'win32' else platform.machine()
|
14
|
+
args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
|
15
|
+
arch_args = ['-ffixed-x18'] if target == 'arm64' else []
|
16
|
+
obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
|
17
|
+
return jit_loader(obj)
|
18
|
+
|
19
|
+
def disassemble(self, lib:bytes): return capstone_flatdump(lib)
|
20
|
+
|
21
|
+
class ClangDevice(Compiled):
|
22
|
+
def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
|
23
|
+
|
24
|
+
CPUDevice = ClangDevice
|
tinygrad/runtime/ops_cuda.py
CHANGED
@@ -1,25 +1,25 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
import ctypes, ctypes.util, functools
|
3
|
-
from typing import Tuple, Optional, List
|
4
3
|
from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t
|
5
|
-
from tinygrad.device import Compiled,
|
4
|
+
from tinygrad.device import Compiled, BufferSpec, LRUAllocator
|
6
5
|
from tinygrad.renderer.cstyle import CUDARenderer
|
7
6
|
from tinygrad.renderer.ptx import PTXRenderer
|
8
7
|
from tinygrad.runtime.autogen import cuda
|
9
8
|
from tinygrad.runtime.support.compiler_cuda import cuda_disassemble, pretty_ptx, CUDACompiler, PTXCompiler, PTX
|
10
9
|
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
|
10
|
+
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
|
11
11
|
|
12
12
|
def check(status):
|
13
13
|
if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
|
14
14
|
|
15
|
-
def encode_args(args, vals) ->
|
15
|
+
def encode_args(args, vals) -> tuple[ctypes.Structure, ctypes.Array]:
|
16
16
|
c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] +
|
17
17
|
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
|
18
18
|
vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2),
|
19
19
|
ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p), ctypes.c_void_p(0))
|
20
20
|
return c_args, vargs
|
21
21
|
|
22
|
-
def cu_time_execution(cb, enable=False) ->
|
22
|
+
def cu_time_execution(cb, enable=False) -> float|None:
|
23
23
|
if not enable: return cb()
|
24
24
|
evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
|
25
25
|
cuda.cuEventRecord(evs[0], None)
|
@@ -31,17 +31,17 @@ def cu_time_execution(cb, enable=False) -> Optional[float]:
|
|
31
31
|
return ret.value * 1e-3
|
32
32
|
|
33
33
|
class CUDAProgram:
|
34
|
-
def __init__(self,
|
35
|
-
self.
|
34
|
+
def __init__(self, dev:CUDADevice, name:str, lib:bytes, smem:int=0):
|
35
|
+
self.dev, self.name, self.lib, self.smem = dev, name, lib, smem
|
36
36
|
if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
|
37
|
-
if DEBUG >= 6: cuda_disassemble(lib,
|
37
|
+
if DEBUG >= 6: cuda_disassemble(lib, dev.arch)
|
38
38
|
|
39
|
-
check(cuda.cuCtxSetCurrent(self.
|
39
|
+
check(cuda.cuCtxSetCurrent(self.dev.context))
|
40
40
|
self.module = cuda.CUmodule()
|
41
41
|
status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
|
42
42
|
if status != 0:
|
43
43
|
del self.module
|
44
|
-
cuda_disassemble(lib,
|
44
|
+
cuda_disassemble(lib, dev.arch)
|
45
45
|
raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
|
46
46
|
check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
|
47
47
|
self.prg = prg
|
@@ -50,47 +50,50 @@ class CUDAProgram:
|
|
50
50
|
def __del__(self):
|
51
51
|
if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
|
52
52
|
|
53
|
-
def __call__(self, *args, global_size:
|
54
|
-
check(cuda.cuCtxSetCurrent(self.
|
53
|
+
def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
54
|
+
check(cuda.cuCtxSetCurrent(self.dev.context))
|
55
55
|
if not hasattr(self, "vargs"):
|
56
56
|
self.c_args, self.vargs = encode_args(args, vals)
|
57
|
+
|
58
|
+
# HACK: For MOCKGPU send the args struct itself.
|
59
|
+
if MOCKGPU: self.vargs = self.c_args # type: ignore[assignment]
|
57
60
|
else:
|
58
61
|
for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
|
59
62
|
for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
|
60
63
|
return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, self.smem, None, None, self.vargs)), enable=wait)
|
61
64
|
|
62
65
|
class CUDAAllocator(LRUAllocator):
|
63
|
-
def __init__(self,
|
64
|
-
self.
|
66
|
+
def __init__(self, dev:CUDADevice):
|
67
|
+
self.dev = dev
|
65
68
|
super().__init__()
|
66
|
-
def _alloc(self, size, options:
|
67
|
-
check(cuda.cuCtxSetCurrent(self.
|
69
|
+
def _alloc(self, size, options:BufferSpec):
|
70
|
+
check(cuda.cuCtxSetCurrent(self.dev.context))
|
68
71
|
if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
|
69
72
|
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
|
70
|
-
def _free(self, opaque, options:
|
73
|
+
def _free(self, opaque, options:BufferSpec):
|
71
74
|
if options.host: check(cuda.cuMemFreeHost(opaque))
|
72
75
|
else: check(cuda.cuMemFree_v2(opaque))
|
73
|
-
def
|
74
|
-
check(cuda.cuCtxSetCurrent(self.
|
75
|
-
host_mem = self.alloc(len(src),
|
76
|
-
self.
|
76
|
+
def _copyin(self, dest, src:memoryview):
|
77
|
+
check(cuda.cuCtxSetCurrent(self.dev.context))
|
78
|
+
host_mem = self.alloc(len(src), BufferSpec(host=True))
|
79
|
+
self.dev.pending_copyin.append((host_mem, len(src), BufferSpec(host=True)))
|
77
80
|
ctypes.memmove(host_mem, from_mv(src), len(src))
|
78
81
|
check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
|
79
|
-
def
|
82
|
+
def _copyout(self, dest:memoryview, src):
|
80
83
|
CUDADevice.synchronize_system()
|
81
|
-
check(cuda.cuCtxSetCurrent(self.
|
84
|
+
check(cuda.cuCtxSetCurrent(self.dev.context))
|
82
85
|
check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest)))
|
83
|
-
def
|
86
|
+
def _transfer(self, dest, src, sz:int, src_dev, dest_dev):
|
84
87
|
check(cuda.cuCtxSetCurrent(src_dev.context))
|
85
88
|
check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
|
86
89
|
check(cuda.cuMemcpyDtoDAsync_v2(dest, src, sz, None))
|
87
90
|
check(cuda.cuEventRecord(sync_event, None))
|
88
91
|
check(cuda.cuCtxSetCurrent(dest_dev.context))
|
89
92
|
check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev
|
90
|
-
def
|
93
|
+
def _offset(self, buf, size:int, offset:int): return cuda.CUdeviceptr_v2(buf.value + offset)
|
91
94
|
|
92
95
|
class CUDADevice(Compiled):
|
93
|
-
devices:
|
96
|
+
devices: list[CUDADevice] = []
|
94
97
|
peer_access = False
|
95
98
|
|
96
99
|
def __init__(self, device:str):
|
@@ -110,12 +113,12 @@ class CUDADevice(Compiled):
|
|
110
113
|
CUDADevice.peer_access = True
|
111
114
|
|
112
115
|
self.arch = f"sm_{major.value}{minor.value}"
|
113
|
-
self.pending_copyin:
|
116
|
+
self.pending_copyin: list[tuple[int, int, BufferSpec|None]] = []
|
114
117
|
CUDADevice.devices.append(self)
|
115
118
|
|
116
119
|
from tinygrad.runtime.graph.cuda import CUDAGraph
|
117
120
|
super().__init__(device, CUDAAllocator(self), PTXRenderer(self.arch) if PTX else CUDARenderer(self.arch),
|
118
|
-
PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self),
|
121
|
+
PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)
|
119
122
|
|
120
123
|
def synchronize(self):
|
121
124
|
check(cuda.cuCtxSetCurrent(self.context))
|
tinygrad/runtime/ops_disk.py
CHANGED
@@ -1,72 +1,11 @@
|
|
1
|
-
from __future__ import annotations
|
2
1
|
import os, sys, mmap, io, ctypes, ctypes.util, contextlib
|
3
|
-
from typing import Optional, Generator,
|
2
|
+
from typing import Optional, Generator, Callable
|
4
3
|
from tinygrad.helpers import OSX, round_up
|
5
4
|
from tinygrad.device import Compiled, Allocator
|
6
5
|
with contextlib.suppress(ImportError):
|
7
6
|
import _posixshmem
|
8
7
|
from tinygrad.runtime.autogen import io_uring, libc
|
9
8
|
|
10
|
-
class DiskBuffer:
|
11
|
-
def __init__(self, device:DiskDevice, size:int, offset=0):
|
12
|
-
self.device, self.size, self.offset = device, size, offset
|
13
|
-
def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
|
14
|
-
def _buf(self) -> memoryview:
|
15
|
-
assert self.device.mem is not None, "DiskBuffer wasn't opened"
|
16
|
-
return memoryview(self.device.mem)[self.offset:self.offset+self.size]
|
17
|
-
|
18
|
-
MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
|
19
|
-
class DiskAllocator(Allocator):
|
20
|
-
def __init__(self, device:DiskDevice): self.device = device
|
21
|
-
def _alloc(self, size:int, options):
|
22
|
-
self.device._might_open(size)
|
23
|
-
return DiskBuffer(self.device, size)
|
24
|
-
def _free(self, opaque, options): self.device._might_close()
|
25
|
-
def as_buffer(self, src:DiskBuffer): return src._buf()
|
26
|
-
def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
|
27
|
-
def copyout(self, dest:memoryview, src:DiskBuffer):
|
28
|
-
if OSX and self.device.fd is not None:
|
29
|
-
# OSX doesn't seem great at mmap, this is faster
|
30
|
-
with io.FileIO(self.device.fd, "a+b", closefd=False) as fo:
|
31
|
-
fo.seek(src.offset)
|
32
|
-
fo.readinto(dest)
|
33
|
-
else:
|
34
|
-
dest[:] = src._buf()
|
35
|
-
|
36
|
-
def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[Tuple[int, int, int, int], None, None]:
|
37
|
-
assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
|
38
|
-
|
39
|
-
fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
|
40
|
-
processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
|
41
|
-
reqs: List[Tuple[int, int, int, int]] = []
|
42
|
-
|
43
|
-
while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
|
44
|
-
if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
|
45
|
-
# Prepare sqe
|
46
|
-
sqe_index = (tail:=DiskDevice.io_uring.sq.ktail[0]) & DiskDevice.io_uring.sq.kring_mask[0]
|
47
|
-
sqe = DiskDevice.io_uring.sq.sqes[sqe_index]
|
48
|
-
sqe.opcode, sqe.fd, sqe.off = io_uring.IORING_OP_READ, self.device.fd, fd_offset + next_read_offset
|
49
|
-
sqe.addr, sqe.len, sqe.user_data = copy_batch[0], min(seg_len, total_copy_size - next_read_offset), len(reqs)
|
50
|
-
|
51
|
-
# Send sqe
|
52
|
-
DiskDevice.io_uring.sq.array[sqe_index] = sqe_index
|
53
|
-
DiskDevice.io_uring.sq.ktail[0] = tail + 1
|
54
|
-
libc.syscall(io_uring.NR_io_uring_enter, DiskDevice.io_uring.ring_fd, 1, 1, io_uring.IORING_ENTER_GETEVENTS)
|
55
|
-
|
56
|
-
reqs.append((copy_batch, copied_in, minor_offset, real_copy_size:=min(sqe.len - minor_offset, size - copied_in)))
|
57
|
-
next_read_offset += sqe.len
|
58
|
-
copied_in += real_copy_size
|
59
|
-
minor_offset = 0
|
60
|
-
|
61
|
-
if (head:=DiskDevice.io_uring.cq.khead[0]) != DiskDevice.io_uring.cq.ktail[0]:
|
62
|
-
cqe = DiskDevice.io_uring.cq.cqes[head & DiskDevice.io_uring.cq.kring_mask[0]]
|
63
|
-
assert cqe.res >= 0, f"read from disk failed, err: {cqe.res}"
|
64
|
-
yield reqs[cqe.user_data]
|
65
|
-
DiskDevice.io_uring.cq.khead[0] = head + 1 # advance
|
66
|
-
processed_reqs_cnt += 1
|
67
|
-
|
68
|
-
def offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
|
69
|
-
|
70
9
|
class DiskDevice(Compiled):
|
71
10
|
_tried_io_uring_init = False
|
72
11
|
|
@@ -81,7 +20,7 @@ class DiskDevice(Compiled):
|
|
81
20
|
self.count += 1
|
82
21
|
assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
|
83
22
|
if self.size is not None: return
|
84
|
-
filename = self.
|
23
|
+
filename = self.device[len("disk:"):]
|
85
24
|
self.size = size
|
86
25
|
|
87
26
|
if sys.platform != "win32" and filename.startswith("shm:"):
|
@@ -122,3 +61,63 @@ class DiskDevice(Compiled):
|
|
122
61
|
kring_mask=u32ptr(sq_ptr+p.cq_off.ring_mask), cqes=ctypes.cast(cq_ptr+p.cq_off.cqes, ctypes.POINTER(io_uring.struct_io_uring_cqe)))
|
123
62
|
|
124
63
|
DiskDevice.io_uring = io_uring.struct_io_uring(ring_fd=fd, sq=sqdesc, cq=cqdesc) # type: ignore
|
64
|
+
|
65
|
+
class DiskBuffer:
|
66
|
+
def __init__(self, device:DiskDevice, size:int, offset=0):
|
67
|
+
self.device, self.size, self.offset = device, size, offset
|
68
|
+
def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
|
69
|
+
def _buf(self) -> memoryview:
|
70
|
+
assert hasattr(self.device, "mem"), f"DiskBuffer wasn't opened: {self.device.device}"
|
71
|
+
return memoryview(self.device.mem)[self.offset:self.offset+self.size]
|
72
|
+
|
73
|
+
MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
|
74
|
+
class DiskAllocator(Allocator):
|
75
|
+
def __init__(self, dev:DiskDevice): self.dev = dev
|
76
|
+
def _alloc(self, size:int, options):
|
77
|
+
self.dev._might_open(size)
|
78
|
+
return DiskBuffer(self.dev, size)
|
79
|
+
def _free(self, opaque, options): self.dev._might_close()
|
80
|
+
def _as_buffer(self, src:DiskBuffer): return src._buf()
|
81
|
+
def _copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
|
82
|
+
def _copyout(self, dest:memoryview, src:DiskBuffer):
|
83
|
+
if OSX and self.dev.fd is not None:
|
84
|
+
# OSX doesn't seem great at mmap, this is faster
|
85
|
+
with io.FileIO(self.dev.fd, "a+b", closefd=False) as fo:
|
86
|
+
fo.seek(src.offset)
|
87
|
+
fo.readinto(dest)
|
88
|
+
else:
|
89
|
+
dest[:] = src._buf()
|
90
|
+
|
91
|
+
def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[tuple[int, int, int, int], None, None]:
|
92
|
+
assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
|
93
|
+
|
94
|
+
fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
|
95
|
+
processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
|
96
|
+
reqs: list[tuple[int, int, int, int]] = []
|
97
|
+
|
98
|
+
while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
|
99
|
+
if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
|
100
|
+
# Prepare sqe
|
101
|
+
sqe_index = (tail:=DiskDevice.io_uring.sq.ktail[0]) & DiskDevice.io_uring.sq.kring_mask[0]
|
102
|
+
sqe = DiskDevice.io_uring.sq.sqes[sqe_index]
|
103
|
+
sqe.opcode, sqe.fd, sqe.off = io_uring.IORING_OP_READ, self.dev.fd, fd_offset + next_read_offset
|
104
|
+
sqe.addr, sqe.len, sqe.user_data = copy_batch[0], min(seg_len, total_copy_size - next_read_offset), len(reqs)
|
105
|
+
|
106
|
+
# Send sqe
|
107
|
+
DiskDevice.io_uring.sq.array[sqe_index] = sqe_index
|
108
|
+
DiskDevice.io_uring.sq.ktail[0] = tail + 1
|
109
|
+
libc.syscall(io_uring.NR_io_uring_enter, DiskDevice.io_uring.ring_fd, 1, 1, io_uring.IORING_ENTER_GETEVENTS)
|
110
|
+
|
111
|
+
reqs.append((copy_batch, copied_in, minor_offset, real_copy_size:=min(sqe.len - minor_offset, size - copied_in)))
|
112
|
+
next_read_offset += sqe.len
|
113
|
+
copied_in += real_copy_size
|
114
|
+
minor_offset = 0
|
115
|
+
|
116
|
+
if (head:=DiskDevice.io_uring.cq.khead[0]) != DiskDevice.io_uring.cq.ktail[0]:
|
117
|
+
cqe = DiskDevice.io_uring.cq.cqes[head & DiskDevice.io_uring.cq.kring_mask[0]]
|
118
|
+
assert cqe.res >= 0, f"read from disk failed, err: {cqe.res}"
|
119
|
+
yield reqs[cqe.user_data]
|
120
|
+
DiskDevice.io_uring.cq.khead[0] = head + 1 # advance
|
121
|
+
processed_reqs_cnt += 1
|
122
|
+
|
123
|
+
def _offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
|