tinygrad 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/__init__.py +0 -0
- tinygrad/codegen/kernel.py +78 -90
- tinygrad/codegen/linearizer.py +237 -169
- tinygrad/codegen/uops.py +278 -242
- tinygrad/device.py +147 -10
- tinygrad/dtype.py +7 -7
- tinygrad/engine/graph.py +16 -16
- tinygrad/engine/jit.py +39 -36
- tinygrad/engine/realize.py +6 -5
- tinygrad/engine/schedule.py +15 -7
- tinygrad/engine/search.py +6 -3
- tinygrad/function.py +17 -23
- tinygrad/helpers.py +77 -8
- tinygrad/lazy.py +26 -26
- tinygrad/multi.py +13 -9
- tinygrad/nn/__init__.py +1 -1
- tinygrad/nn/datasets.py +2 -1
- tinygrad/nn/state.py +3 -4
- tinygrad/ops.py +49 -16
- tinygrad/renderer/__init__.py +8 -4
- tinygrad/renderer/assembly.py +93 -100
- tinygrad/renderer/cstyle.py +47 -42
- tinygrad/renderer/llvmir.py +30 -30
- tinygrad/runtime/__init__.py +0 -0
- tinygrad/runtime/autogen/amd_gpu.py +11504 -1
- tinygrad/runtime/autogen/comgr.py +36 -10
- tinygrad/runtime/autogen/hsa.py +146 -14
- tinygrad/runtime/autogen/io_uring.py +1486 -0
- tinygrad/runtime/autogen/nv_gpu.py +269 -0
- tinygrad/runtime/driver/__init__.py +0 -0
- tinygrad/runtime/driver/hip_comgr.py +20 -11
- tinygrad/runtime/graph/__init__.py +0 -0
- tinygrad/runtime/graph/clang.py +3 -2
- tinygrad/runtime/graph/cuda.py +2 -2
- tinygrad/runtime/graph/hcq.py +122 -78
- tinygrad/runtime/ops_amd.py +302 -316
- tinygrad/runtime/ops_cuda.py +3 -3
- tinygrad/runtime/ops_disk.py +70 -5
- tinygrad/runtime/ops_gpu.py +2 -2
- tinygrad/runtime/ops_metal.py +5 -6
- tinygrad/runtime/ops_npy.py +1 -1
- tinygrad/runtime/ops_nv.py +161 -166
- tinygrad/runtime/ops_python.py +20 -16
- tinygrad/shape/__init__.py +0 -0
- tinygrad/shape/shapetracker.py +5 -2
- tinygrad/shape/symbolic.py +1 -3
- tinygrad/shape/view.py +34 -19
- tinygrad/tensor.py +219 -135
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
- tinygrad-0.9.1.dist-info/RECORD +63 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
- tinygrad/runtime/driver/hsa.py +0 -143
- tinygrad/runtime/graph/hsa.py +0 -171
- tinygrad/runtime/ops_hsa.py +0 -278
- tinygrad-0.9.0.dist-info/RECORD +0 -60
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_cuda.py
CHANGED
@@ -7,7 +7,7 @@ from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, in
|
|
7
7
|
from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
|
8
8
|
from tinygrad.renderer.cstyle import CUDARenderer
|
9
9
|
from tinygrad.renderer.assembly import PTXRenderer
|
10
|
-
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
|
10
|
+
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
|
11
11
|
|
12
12
|
def pretty_ptx(s):
|
13
13
|
# all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
|
@@ -119,9 +119,9 @@ class CUDAAllocator(LRUAllocator):
|
|
119
119
|
def _alloc(self, size, options:BufferOptions):
|
120
120
|
check(cuda.cuCtxSetCurrent(self.device.context))
|
121
121
|
if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
|
122
|
-
|
122
|
+
return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
|
123
123
|
def _free(self, opaque, options:BufferOptions):
|
124
|
-
if options.host:
|
124
|
+
if options.host: check(cuda.cuMemFreeHost(opaque))
|
125
125
|
else: check(cuda.cuMemFree_v2(opaque))
|
126
126
|
def copyin(self, dest, src:memoryview):
|
127
127
|
check(cuda.cuCtxSetCurrent(self.device.context))
|
tinygrad/runtime/ops_disk.py
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
import os, mmap, _posixshmem, io
|
3
|
-
from typing import Optional
|
4
|
-
from tinygrad.helpers import OSX
|
2
|
+
import os, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
|
3
|
+
from typing import Optional, Generator, Tuple, Callable, List
|
4
|
+
from tinygrad.helpers import OSX, round_up
|
5
5
|
from tinygrad.device import Compiled, Allocator
|
6
|
+
import tinygrad.runtime.autogen.io_uring as io_uring
|
7
|
+
|
8
|
+
libc = ctypes.CDLL(ctypes.util.find_library("c"))
|
9
|
+
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
|
10
|
+
libc.mmap.restype = ctypes.c_void_p
|
6
11
|
|
7
12
|
class DiskBuffer:
|
8
13
|
def __init__(self, device:DiskDevice, size:int, offset=0):
|
@@ -18,7 +23,7 @@ class DiskAllocator(Allocator):
|
|
18
23
|
def _alloc(self, size:int, options):
|
19
24
|
self.device._might_open(size)
|
20
25
|
return DiskBuffer(self.device, size)
|
21
|
-
def _free(self,
|
26
|
+
def _free(self, opaque, options): self.device._might_close()
|
22
27
|
def as_buffer(self, src:DiskBuffer): return src._buf()
|
23
28
|
def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
|
24
29
|
def copyout(self, dest:memoryview, src:DiskBuffer):
|
@@ -29,10 +34,47 @@ class DiskAllocator(Allocator):
|
|
29
34
|
fo.readinto(dest)
|
30
35
|
else:
|
31
36
|
dest[:] = src._buf()
|
37
|
+
|
38
|
+
def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[Tuple[int, int, int, int], None, None]:
|
39
|
+
assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
|
40
|
+
|
41
|
+
fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
|
42
|
+
processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
|
43
|
+
reqs: List[Tuple[int, int, int, int]] = []
|
44
|
+
|
45
|
+
while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
|
46
|
+
if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
|
47
|
+
# Prepare sqe
|
48
|
+
sqe_index = (tail:=DiskDevice.io_uring.sq.ktail[0]) & DiskDevice.io_uring.sq.kring_mask[0]
|
49
|
+
sqe = DiskDevice.io_uring.sq.sqes[sqe_index]
|
50
|
+
sqe.opcode, sqe.fd, sqe.off = io_uring.IORING_OP_READ, self.device.fd, fd_offset + next_read_offset
|
51
|
+
sqe.addr, sqe.len, sqe.user_data = copy_batch[0], min(seg_len, total_copy_size - next_read_offset), len(reqs)
|
52
|
+
|
53
|
+
# Send sqe
|
54
|
+
DiskDevice.io_uring.sq.array[sqe_index] = sqe_index
|
55
|
+
DiskDevice.io_uring.sq.ktail[0] = tail + 1
|
56
|
+
libc.syscall(io_uring.NR_io_uring_enter, DiskDevice.io_uring.ring_fd, 1, 1, io_uring.IORING_ENTER_GETEVENTS)
|
57
|
+
|
58
|
+
reqs.append((copy_batch, copied_in, minor_offset, real_copy_size:=min(sqe.len - minor_offset, size - copied_in)))
|
59
|
+
next_read_offset += sqe.len
|
60
|
+
copied_in += real_copy_size
|
61
|
+
minor_offset = 0
|
62
|
+
|
63
|
+
if (head:=DiskDevice.io_uring.cq.khead[0]) != DiskDevice.io_uring.cq.ktail[0]:
|
64
|
+
cqe = DiskDevice.io_uring.cq.cqes[head & DiskDevice.io_uring.cq.kring_mask[0]]
|
65
|
+
assert cqe.res >= 0, f"read from disk failed, err: {cqe.res}"
|
66
|
+
yield reqs[cqe.user_data]
|
67
|
+
DiskDevice.io_uring.cq.khead[0] = head + 1 # advance
|
68
|
+
processed_reqs_cnt += 1
|
69
|
+
|
32
70
|
def offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
|
33
71
|
|
34
72
|
class DiskDevice(Compiled):
|
73
|
+
_tried_io_uring_init = False
|
74
|
+
|
35
75
|
def __init__(self, device:str):
|
76
|
+
if not DiskDevice._tried_io_uring_init: self._iouring_setup()
|
77
|
+
|
36
78
|
self.size: Optional[int] = None
|
37
79
|
self.count = 0
|
38
80
|
super().__init__(device, DiskAllocator(self), None, None, None)
|
@@ -52,9 +94,32 @@ class DiskDevice(Compiled):
|
|
52
94
|
except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
|
53
95
|
if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
|
54
96
|
self.mem = mmap.mmap(self.fd, self.size)
|
55
|
-
if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
|
97
|
+
if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
|
98
|
+
with contextlib.suppress(OSError): self.mem.madvise(hp) # some systems have transparent_hugepage disabled
|
56
99
|
def _might_close(self):
|
57
100
|
self.count -= 1
|
58
101
|
if self.count == 0:
|
59
102
|
if hasattr(self, 'fd'): os.close(self.fd)
|
60
103
|
self.size = None
|
104
|
+
def _iouring_setup(self):
|
105
|
+
DiskDevice._tried_io_uring_init = True
|
106
|
+
|
107
|
+
if platform.system() != 'Linux': return
|
108
|
+
|
109
|
+
fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
|
110
|
+
if fd < 0: return
|
111
|
+
|
112
|
+
sq_ptr = libc.mmap(0, p.sq_off.array + p.sq_entries * 4, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, 0)
|
113
|
+
cq_ptr = libc.mmap(0, p.cq_off.cqes + p.cq_entries * ctypes.sizeof(io_uring.struct_io_uring_cqe),
|
114
|
+
mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_CQ_RING)
|
115
|
+
sqes = libc.mmap(0, p.sq_entries * ctypes.sizeof(io_uring.struct_io_uring_sqe),
|
116
|
+
mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_SQES)
|
117
|
+
|
118
|
+
def u32ptr(val): return ctypes.cast(val, ctypes.POINTER(ctypes.c_uint32))
|
119
|
+
sqdesc = io_uring.struct_io_uring_sq(khead=u32ptr(sq_ptr+p.sq_off.head), ktail=u32ptr(sq_ptr+p.sq_off.tail), array=u32ptr(sq_ptr+p.sq_off.array),
|
120
|
+
kring_mask=u32ptr(sq_ptr+p.sq_off.ring_mask), sqes=ctypes.cast(sqes, ctypes.POINTER(io_uring.struct_io_uring_sqe)))
|
121
|
+
|
122
|
+
cqdesc = io_uring.struct_io_uring_cq(khead=u32ptr(cq_ptr+p.cq_off.head), ktail=u32ptr(cq_ptr+p.cq_off.tail),
|
123
|
+
kring_mask=u32ptr(sq_ptr+p.cq_off.ring_mask), cqes=ctypes.cast(cq_ptr+p.cq_off.cqes, ctypes.POINTER(io_uring.struct_io_uring_cqe)))
|
124
|
+
|
125
|
+
DiskDevice.io_uring = io_uring.struct_io_uring(ring_fd=fd, sq=sqdesc, cq=cqdesc) # type: ignore
|
tinygrad/runtime/ops_gpu.py
CHANGED
@@ -66,8 +66,8 @@ class CLAllocator(LRUAllocator):
|
|
66
66
|
return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
|
67
67
|
cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
|
68
68
|
options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status)
|
69
|
-
|
70
|
-
def _free(self,
|
69
|
+
return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status)
|
70
|
+
def _free(self, opaque:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(opaque))
|
71
71
|
def copyin(self, dest:ctypes._CData, src:memoryview):
|
72
72
|
check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
|
73
73
|
self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
|
tinygrad/runtime/ops_metal.py
CHANGED
@@ -20,12 +20,11 @@ class MetalCompiler(Compiler):
|
|
20
20
|
# NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
|
21
21
|
air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8'))
|
22
22
|
return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
return library.libraryDataContents().bytes().tobytes()
|
23
|
+
options = Metal.MTLCompileOptions.new()
|
24
|
+
options.setFastMathEnabled_(getenv("METAL_FAST_MATH"))
|
25
|
+
try: library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None))
|
26
|
+
except AssertionError as e: raise CompileError(e) from e
|
27
|
+
return library.libraryDataContents().bytes().tobytes()
|
29
28
|
|
30
29
|
class MetalProgram:
|
31
30
|
def __init__(self, device:MetalDevice, name:str, lib:bytes):
|
tinygrad/runtime/ops_npy.py
CHANGED
@@ -2,7 +2,7 @@ import numpy as np
|
|
2
2
|
from tinygrad.helpers import flat_mv
|
3
3
|
from tinygrad.device import Compiled, Allocator
|
4
4
|
|
5
|
-
class NpyAllocator(Allocator):
|
5
|
+
class NpyAllocator(Allocator): # pylint: disable=abstract-method
|
6
6
|
def copyout(self, dest:memoryview, src:np.ndarray): dest[:] = flat_mv(np.require(src, requirements='C').data)
|
7
7
|
|
8
8
|
class NpyDevice(Compiled):
|