tinygrad 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. tinygrad/codegen/__init__.py +0 -0
  2. tinygrad/codegen/kernel.py +78 -90
  3. tinygrad/codegen/linearizer.py +237 -169
  4. tinygrad/codegen/uops.py +278 -242
  5. tinygrad/device.py +147 -10
  6. tinygrad/dtype.py +7 -7
  7. tinygrad/engine/graph.py +16 -16
  8. tinygrad/engine/jit.py +39 -36
  9. tinygrad/engine/realize.py +6 -5
  10. tinygrad/engine/schedule.py +15 -7
  11. tinygrad/engine/search.py +6 -3
  12. tinygrad/function.py +17 -23
  13. tinygrad/helpers.py +77 -8
  14. tinygrad/lazy.py +26 -26
  15. tinygrad/multi.py +13 -9
  16. tinygrad/nn/__init__.py +1 -1
  17. tinygrad/nn/datasets.py +2 -1
  18. tinygrad/nn/state.py +3 -4
  19. tinygrad/ops.py +49 -16
  20. tinygrad/renderer/__init__.py +8 -4
  21. tinygrad/renderer/assembly.py +93 -100
  22. tinygrad/renderer/cstyle.py +47 -42
  23. tinygrad/renderer/llvmir.py +30 -30
  24. tinygrad/runtime/__init__.py +0 -0
  25. tinygrad/runtime/autogen/amd_gpu.py +11504 -1
  26. tinygrad/runtime/autogen/comgr.py +36 -10
  27. tinygrad/runtime/autogen/hsa.py +146 -14
  28. tinygrad/runtime/autogen/io_uring.py +1486 -0
  29. tinygrad/runtime/autogen/nv_gpu.py +269 -0
  30. tinygrad/runtime/driver/__init__.py +0 -0
  31. tinygrad/runtime/driver/hip_comgr.py +20 -11
  32. tinygrad/runtime/graph/__init__.py +0 -0
  33. tinygrad/runtime/graph/clang.py +3 -2
  34. tinygrad/runtime/graph/cuda.py +2 -2
  35. tinygrad/runtime/graph/hcq.py +122 -78
  36. tinygrad/runtime/ops_amd.py +302 -316
  37. tinygrad/runtime/ops_cuda.py +3 -3
  38. tinygrad/runtime/ops_disk.py +70 -5
  39. tinygrad/runtime/ops_gpu.py +2 -2
  40. tinygrad/runtime/ops_metal.py +5 -6
  41. tinygrad/runtime/ops_npy.py +1 -1
  42. tinygrad/runtime/ops_nv.py +161 -166
  43. tinygrad/runtime/ops_python.py +20 -16
  44. tinygrad/shape/__init__.py +0 -0
  45. tinygrad/shape/shapetracker.py +5 -2
  46. tinygrad/shape/symbolic.py +1 -3
  47. tinygrad/shape/view.py +34 -19
  48. tinygrad/tensor.py +219 -135
  49. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
  50. tinygrad-0.9.1.dist-info/RECORD +63 -0
  51. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  52. tinygrad/runtime/driver/hsa.py +0 -143
  53. tinygrad/runtime/graph/hsa.py +0 -171
  54. tinygrad/runtime/ops_hsa.py +0 -278
  55. tinygrad-0.9.0.dist-info/RECORD +0 -60
  56. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
  57. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,7 @@ from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, in
7
7
  from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
8
8
  from tinygrad.renderer.cstyle import CUDARenderer
9
9
  from tinygrad.renderer.assembly import PTXRenderer
10
- if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
10
+ if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
11
11
 
12
12
  def pretty_ptx(s):
13
13
  # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
@@ -119,9 +119,9 @@ class CUDAAllocator(LRUAllocator):
119
119
  def _alloc(self, size, options:BufferOptions):
120
120
  check(cuda.cuCtxSetCurrent(self.device.context))
121
121
  if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
122
- else: return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
122
+ return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
123
123
  def _free(self, opaque, options:BufferOptions):
124
- if options.host: return check(cuda.cuMemFreeHost(opaque))
124
+ if options.host: check(cuda.cuMemFreeHost(opaque))
125
125
  else: check(cuda.cuMemFree_v2(opaque))
126
126
  def copyin(self, dest, src:memoryview):
127
127
  check(cuda.cuCtxSetCurrent(self.device.context))
@@ -1,8 +1,13 @@
1
1
  from __future__ import annotations
2
- import os, mmap, _posixshmem, io
3
- from typing import Optional
4
- from tinygrad.helpers import OSX
2
+ import os, mmap, _posixshmem, io, ctypes, ctypes.util, platform, contextlib
3
+ from typing import Optional, Generator, Tuple, Callable, List
4
+ from tinygrad.helpers import OSX, round_up
5
5
  from tinygrad.device import Compiled, Allocator
6
+ import tinygrad.runtime.autogen.io_uring as io_uring
7
+
8
+ libc = ctypes.CDLL(ctypes.util.find_library("c"))
9
+ libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
10
+ libc.mmap.restype = ctypes.c_void_p
6
11
 
7
12
  class DiskBuffer:
8
13
  def __init__(self, device:DiskDevice, size:int, offset=0):
@@ -18,7 +23,7 @@ class DiskAllocator(Allocator):
18
23
  def _alloc(self, size:int, options):
19
24
  self.device._might_open(size)
20
25
  return DiskBuffer(self.device, size)
21
- def _free(self, buf, options): self.device._might_close()
26
+ def _free(self, opaque, options): self.device._might_close()
22
27
  def as_buffer(self, src:DiskBuffer): return src._buf()
23
28
  def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
24
29
  def copyout(self, dest:memoryview, src:DiskBuffer):
@@ -29,10 +34,47 @@ class DiskAllocator(Allocator):
29
34
  fo.readinto(dest)
30
35
  else:
31
36
  dest[:] = src._buf()
37
+
38
+ def _copyout_sharded(self, src:DiskBuffer, size:int, _get_free_buf:Callable, seg_len:int) -> Generator[Tuple[int, int, int, int], None, None]:
39
+ assert hasattr(DiskDevice, 'io_uring'), "function requires io uring support"
40
+
41
+ fd_offset = src.offset - (minor_offset := src.offset % mmap.PAGESIZE)
42
+ processed_reqs_cnt, copied_in, next_read_offset, total_copy_size = 0, 0, 0, round_up(size + minor_offset, mmap.PAGESIZE)
43
+ reqs: List[Tuple[int, int, int, int]] = []
44
+
45
+ while next_read_offset < total_copy_size or len(reqs) != processed_reqs_cnt:
46
+ if next_read_offset < total_copy_size and (copy_batch := _get_free_buf()) is not None:
47
+ # Prepare sqe
48
+ sqe_index = (tail:=DiskDevice.io_uring.sq.ktail[0]) & DiskDevice.io_uring.sq.kring_mask[0]
49
+ sqe = DiskDevice.io_uring.sq.sqes[sqe_index]
50
+ sqe.opcode, sqe.fd, sqe.off = io_uring.IORING_OP_READ, self.device.fd, fd_offset + next_read_offset
51
+ sqe.addr, sqe.len, sqe.user_data = copy_batch[0], min(seg_len, total_copy_size - next_read_offset), len(reqs)
52
+
53
+ # Send sqe
54
+ DiskDevice.io_uring.sq.array[sqe_index] = sqe_index
55
+ DiskDevice.io_uring.sq.ktail[0] = tail + 1
56
+ libc.syscall(io_uring.NR_io_uring_enter, DiskDevice.io_uring.ring_fd, 1, 1, io_uring.IORING_ENTER_GETEVENTS)
57
+
58
+ reqs.append((copy_batch, copied_in, minor_offset, real_copy_size:=min(sqe.len - minor_offset, size - copied_in)))
59
+ next_read_offset += sqe.len
60
+ copied_in += real_copy_size
61
+ minor_offset = 0
62
+
63
+ if (head:=DiskDevice.io_uring.cq.khead[0]) != DiskDevice.io_uring.cq.ktail[0]:
64
+ cqe = DiskDevice.io_uring.cq.cqes[head & DiskDevice.io_uring.cq.kring_mask[0]]
65
+ assert cqe.res >= 0, f"read from disk failed, err: {cqe.res}"
66
+ yield reqs[cqe.user_data]
67
+ DiskDevice.io_uring.cq.khead[0] = head + 1 # advance
68
+ processed_reqs_cnt += 1
69
+
32
70
  def offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
33
71
 
34
72
  class DiskDevice(Compiled):
73
+ _tried_io_uring_init = False
74
+
35
75
  def __init__(self, device:str):
76
+ if not DiskDevice._tried_io_uring_init: self._iouring_setup()
77
+
36
78
  self.size: Optional[int] = None
37
79
  self.count = 0
38
80
  super().__init__(device, DiskAllocator(self), None, None, None)
@@ -52,9 +94,32 @@ class DiskDevice(Compiled):
52
94
  except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
53
95
  if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
54
96
  self.mem = mmap.mmap(self.fd, self.size)
55
- if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None: self.mem.madvise(hp) # type: ignore
97
+ if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None:
98
+ with contextlib.suppress(OSError): self.mem.madvise(hp) # some systems have transparent_hugepage disabled
56
99
  def _might_close(self):
57
100
  self.count -= 1
58
101
  if self.count == 0:
59
102
  if hasattr(self, 'fd'): os.close(self.fd)
60
103
  self.size = None
104
+ def _iouring_setup(self):
105
+ DiskDevice._tried_io_uring_init = True
106
+
107
+ if platform.system() != 'Linux': return
108
+
109
+ fd = libc.syscall(io_uring.NR_io_uring_setup, 4096, ctypes.byref(p:=io_uring.struct_io_uring_params()))
110
+ if fd < 0: return
111
+
112
+ sq_ptr = libc.mmap(0, p.sq_off.array + p.sq_entries * 4, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, 0)
113
+ cq_ptr = libc.mmap(0, p.cq_off.cqes + p.cq_entries * ctypes.sizeof(io_uring.struct_io_uring_cqe),
114
+ mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_CQ_RING)
115
+ sqes = libc.mmap(0, p.sq_entries * ctypes.sizeof(io_uring.struct_io_uring_sqe),
116
+ mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_POPULATE, fd, io_uring.IORING_OFF_SQES)
117
+
118
+ def u32ptr(val): return ctypes.cast(val, ctypes.POINTER(ctypes.c_uint32))
119
+ sqdesc = io_uring.struct_io_uring_sq(khead=u32ptr(sq_ptr+p.sq_off.head), ktail=u32ptr(sq_ptr+p.sq_off.tail), array=u32ptr(sq_ptr+p.sq_off.array),
120
+ kring_mask=u32ptr(sq_ptr+p.sq_off.ring_mask), sqes=ctypes.cast(sqes, ctypes.POINTER(io_uring.struct_io_uring_sqe)))
121
+
122
+ cqdesc = io_uring.struct_io_uring_cq(khead=u32ptr(cq_ptr+p.cq_off.head), ktail=u32ptr(cq_ptr+p.cq_off.tail),
123
+ kring_mask=u32ptr(sq_ptr+p.cq_off.ring_mask), cqes=ctypes.cast(cq_ptr+p.cq_off.cqes, ctypes.POINTER(io_uring.struct_io_uring_cqe)))
124
+
125
+ DiskDevice.io_uring = io_uring.struct_io_uring(ring_fd=fd, sq=sqdesc, cq=cqdesc) # type: ignore
@@ -66,8 +66,8 @@ class CLAllocator(LRUAllocator):
66
66
  return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
67
67
  cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
68
68
  options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status)
69
- else: return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status)
70
- def _free(self, buf:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(buf))
69
+ return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status)
70
+ def _free(self, opaque:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(opaque))
71
71
  def copyin(self, dest:ctypes._CData, src:memoryview):
72
72
  check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
73
73
  self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
@@ -20,12 +20,11 @@ class MetalCompiler(Compiler):
20
20
  # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
21
21
  air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8'))
22
22
  return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
23
- else:
24
- options = Metal.MTLCompileOptions.new()
25
- options.setFastMathEnabled_(getenv("METAL_FAST_MATH"))
26
- try: library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None))
27
- except AssertionError as e: raise CompileError(e)
28
- return library.libraryDataContents().bytes().tobytes()
23
+ options = Metal.MTLCompileOptions.new()
24
+ options.setFastMathEnabled_(getenv("METAL_FAST_MATH"))
25
+ try: library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None))
26
+ except AssertionError as e: raise CompileError(e) from e
27
+ return library.libraryDataContents().bytes().tobytes()
29
28
 
30
29
  class MetalProgram:
31
30
  def __init__(self, device:MetalDevice, name:str, lib:bytes):
@@ -2,7 +2,7 @@ import numpy as np
2
2
  from tinygrad.helpers import flat_mv
3
3
  from tinygrad.device import Compiled, Allocator
4
4
 
5
- class NpyAllocator(Allocator):
5
+ class NpyAllocator(Allocator): # pylint: disable=abstract-method
6
6
  def copyout(self, dest:memoryview, src:np.ndarray): dest[:] = flat_mv(np.require(src, requirements='C').data)
7
7
 
8
8
  class NpyDevice(Compiled):