tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. tinygrad/__init__.py +1 -1
  2. tinygrad/apps/llm.py +206 -0
  3. tinygrad/codegen/__init__.py +116 -0
  4. tinygrad/codegen/devectorizer.py +315 -172
  5. tinygrad/codegen/expander.py +8 -16
  6. tinygrad/codegen/gpudims.py +89 -0
  7. tinygrad/codegen/linearize.py +205 -203
  8. tinygrad/codegen/lowerer.py +92 -139
  9. tinygrad/codegen/opt/__init__.py +38 -0
  10. tinygrad/codegen/opt/heuristic.py +125 -0
  11. tinygrad/codegen/opt/kernel.py +510 -0
  12. tinygrad/{engine → codegen/opt}/search.py +51 -35
  13. tinygrad/codegen/opt/swizzler.py +134 -0
  14. tinygrad/codegen/opt/tc.py +127 -0
  15. tinygrad/codegen/quantize.py +67 -0
  16. tinygrad/device.py +122 -132
  17. tinygrad/dtype.py +152 -35
  18. tinygrad/engine/jit.py +81 -54
  19. tinygrad/engine/memory.py +46 -27
  20. tinygrad/engine/realize.py +82 -41
  21. tinygrad/engine/schedule.py +70 -445
  22. tinygrad/frontend/__init__.py +0 -0
  23. tinygrad/frontend/onnx.py +1253 -0
  24. tinygrad/frontend/torch.py +5 -0
  25. tinygrad/gradient.py +19 -27
  26. tinygrad/helpers.py +95 -47
  27. tinygrad/nn/__init__.py +7 -8
  28. tinygrad/nn/optim.py +72 -41
  29. tinygrad/nn/state.py +37 -23
  30. tinygrad/renderer/__init__.py +40 -60
  31. tinygrad/renderer/cstyle.py +143 -128
  32. tinygrad/renderer/llvmir.py +113 -62
  33. tinygrad/renderer/ptx.py +50 -32
  34. tinygrad/renderer/wgsl.py +27 -23
  35. tinygrad/runtime/autogen/am/am.py +5861 -0
  36. tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
  37. tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
  38. tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
  39. tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
  40. tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
  41. tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
  42. tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
  43. tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
  44. tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
  45. tinygrad/runtime/autogen/comgr.py +35 -9
  46. tinygrad/runtime/autogen/comgr_3.py +906 -0
  47. tinygrad/runtime/autogen/cuda.py +2419 -494
  48. tinygrad/runtime/autogen/hsa.py +57 -16
  49. tinygrad/runtime/autogen/ib.py +7171 -0
  50. tinygrad/runtime/autogen/io_uring.py +917 -118
  51. tinygrad/runtime/autogen/kfd.py +748 -26
  52. tinygrad/runtime/autogen/libc.py +613 -218
  53. tinygrad/runtime/autogen/libusb.py +1643 -0
  54. tinygrad/runtime/autogen/nv/nv.py +8602 -0
  55. tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
  56. tinygrad/runtime/autogen/opencl.py +2 -4
  57. tinygrad/runtime/autogen/sqtt.py +1789 -0
  58. tinygrad/runtime/autogen/vfio.py +3 -3
  59. tinygrad/runtime/autogen/webgpu.py +273 -264
  60. tinygrad/runtime/graph/cuda.py +3 -3
  61. tinygrad/runtime/graph/hcq.py +68 -29
  62. tinygrad/runtime/graph/metal.py +29 -13
  63. tinygrad/runtime/graph/remote.py +114 -0
  64. tinygrad/runtime/ops_amd.py +537 -320
  65. tinygrad/runtime/ops_cpu.py +108 -7
  66. tinygrad/runtime/ops_cuda.py +12 -14
  67. tinygrad/runtime/ops_disk.py +13 -10
  68. tinygrad/runtime/ops_dsp.py +47 -40
  69. tinygrad/runtime/ops_gpu.py +13 -11
  70. tinygrad/runtime/ops_hip.py +6 -9
  71. tinygrad/runtime/ops_llvm.py +35 -15
  72. tinygrad/runtime/ops_metal.py +29 -19
  73. tinygrad/runtime/ops_npy.py +5 -3
  74. tinygrad/runtime/ops_null.py +28 -0
  75. tinygrad/runtime/ops_nv.py +306 -234
  76. tinygrad/runtime/ops_python.py +62 -52
  77. tinygrad/runtime/ops_qcom.py +28 -39
  78. tinygrad/runtime/ops_remote.py +482 -0
  79. tinygrad/runtime/ops_webgpu.py +28 -28
  80. tinygrad/runtime/support/am/amdev.py +114 -249
  81. tinygrad/runtime/support/am/ip.py +211 -172
  82. tinygrad/runtime/support/amd.py +138 -0
  83. tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
  84. tinygrad/runtime/support/compiler_cuda.py +8 -11
  85. tinygrad/runtime/support/elf.py +2 -1
  86. tinygrad/runtime/support/hcq.py +184 -97
  87. tinygrad/runtime/support/ib.py +172 -0
  88. tinygrad/runtime/support/llvm.py +3 -4
  89. tinygrad/runtime/support/memory.py +251 -0
  90. tinygrad/runtime/support/nv/__init__.py +0 -0
  91. tinygrad/runtime/support/nv/ip.py +581 -0
  92. tinygrad/runtime/support/nv/nvdev.py +183 -0
  93. tinygrad/runtime/support/system.py +170 -0
  94. tinygrad/runtime/support/usb.py +268 -0
  95. tinygrad/runtime/support/webgpu.py +18 -0
  96. tinygrad/schedule/__init__.py +0 -0
  97. tinygrad/schedule/grouper.py +119 -0
  98. tinygrad/schedule/kernelize.py +368 -0
  99. tinygrad/schedule/multi.py +231 -0
  100. tinygrad/shape/shapetracker.py +40 -46
  101. tinygrad/shape/view.py +88 -52
  102. tinygrad/tensor.py +968 -542
  103. tinygrad/uop/__init__.py +117 -0
  104. tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
  105. tinygrad/uop/mathtraits.py +169 -0
  106. tinygrad/uop/ops.py +1021 -0
  107. tinygrad/uop/spec.py +228 -0
  108. tinygrad/{codegen → uop}/symbolic.py +239 -216
  109. tinygrad/uop/upat.py +163 -0
  110. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
  111. tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
  112. tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
  113. tinygrad/viz/index.html +203 -403
  114. tinygrad/viz/js/index.js +718 -0
  115. tinygrad/viz/js/worker.js +29 -0
  116. tinygrad/viz/serve.py +224 -102
  117. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
  118. tinygrad-0.11.0.dist-info/RECORD +141 -0
  119. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
  120. tinygrad/codegen/kernel.py +0 -693
  121. tinygrad/engine/multi.py +0 -161
  122. tinygrad/ops.py +0 -1003
  123. tinygrad/runtime/ops_cloud.py +0 -220
  124. tinygrad/runtime/support/allocator.py +0 -94
  125. tinygrad/spec.py +0 -155
  126. tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
  127. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
  128. tinygrad/viz/perfetto.html +0 -178
  129. tinygrad-0.10.2.dist-info/RECORD +0 -99
  130. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
  131. {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
- import os, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform
2
- from typing import Any, Union, cast
3
- from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t, PROFILE
4
- from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, cpu_profile, ProfileDeviceEvent, ProfileRangeEvent
1
+ import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform
2
+ from typing import Any, cast
3
+ from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t, PROFILE, ProfileRangeEvent, cpu_profile, unwrap
4
+ from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent
5
5
  from tinygrad.renderer.cstyle import MetalRenderer
6
6
 
7
7
  class objc_id(ctypes.c_void_p): # This prevents ctypes from converting response to plain int, and dict.fromkeys() can use it to dedup
@@ -35,7 +35,7 @@ libobjc.sel_registerName.restype = objc_id
35
35
  libmetal.MTLCreateSystemDefaultDevice.restype = objc_instance
36
36
  libdispatch.dispatch_data_create.restype = objc_instance
37
37
 
38
- @functools.lru_cache(None)
38
+ @functools.cache
39
39
  def msg(selector: str, restype: type[T] = objc_id): # type: ignore [assignment]
40
40
  resname = libobjc.sel_registerName(selector.encode())
41
41
  sender = libobjc["objc_msgSend"] # Using attribute access returns a new reference so setting restype is safe
@@ -43,11 +43,12 @@ def msg(selector: str, restype: type[T] = objc_id): # type: ignore [assignment]
43
43
  def _msg(ptr: objc_id, *args: Any) -> T: return sender(ptr, resname, *args)
44
44
  return _msg
45
45
 
46
- @functools.lru_cache(None)
46
+ @functools.cache
47
47
  def to_ns_str(s: str): return msg("stringWithUTF8String:", objc_instance)(libobjc.objc_getClass(b"NSString"), s.encode())
48
48
  def from_ns_str(s): return bytes(msg("UTF8String", ctypes.c_char_p)(s)).decode()
49
49
 
50
- def to_struct(*t: int, _type: type = ctypes.c_ulong): return init_c_struct_t(tuple([(f"field{i}", _type) for i in range(len(t))]))(*t)
50
+ def to_struct(*t: int, _type: type[ctypes._SimpleCData] = ctypes.c_ulong):
51
+ return init_c_struct_t(tuple([(f"field{i}", _type) for i in range(len(t))]))(*t)
51
52
 
52
53
  def wait_check(cbuf: Any):
53
54
  msg("waitUntilCompleted")(cbuf)
@@ -73,14 +74,17 @@ class MetalDevice(Compiled):
73
74
  Compiled.profile_events += [ProfileDeviceEvent(device)]
74
75
 
75
76
  from tinygrad.runtime.graph.metal import MetalGraph
77
+ # NOTE: GitHub CI macOS runners use paravirtualized metal which is broken with graph.
78
+ # This can be reproduced locally with any virtualization software (like utm) that can create macOS VMs with apple's own virtualization framework.
76
79
  super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler() if getenv("METAL_DIRECT", 1) else Compiler(),
77
- functools.partial(MetalProgram, self), MetalGraph)
80
+ functools.partial(MetalProgram, self), MetalGraph if 'virtual' not in from_ns_str(msg('name')(self.sysdevice)).lower() else None)
78
81
 
79
82
  def synchronize(self):
80
83
  for cbuf in self.mtl_buffers_in_flight:
81
84
  wait_check(cbuf)
82
85
  st, en = decimal.Decimal(cmdbuf_st_time(cbuf)) * 1000000, decimal.Decimal(cmdbuf_en_time(cbuf)) * 1000000
83
- if PROFILE and (lb:=cmdbuf_label(cbuf)) is not None:
86
+ # NOTE: command buffers from MetalGraph are not profiled here
87
+ if PROFILE and (lb:=cmdbuf_label(cbuf)) is not None and not lb.startswith("batched"):
84
88
  Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en, is_copy=lb.startswith("COPY"))]
85
89
  self.mtl_buffers_in_flight.clear()
86
90
 
@@ -97,7 +101,7 @@ class MetalCompiler(Compiler):
97
101
  # This means that MTLCompiler's llvm will create it's own instances of global state because RTLD_LOCAL doesn't export symbols, but if RTLD_GLOBAL
98
102
  # library is loaded first then RTLD_LOCAL library will just use it's symbols. On linux there is RTLD_DEEPBIND to prevent that, but on macos there
99
103
  # doesn't seem to be anything we can do.
100
- with contextlib.suppress(FileNotFoundError):
104
+ with contextlib.suppress(FileNotFoundError, ModuleNotFoundError):
101
105
  import tinygrad.runtime.autogen.llvm # noqa: F401
102
106
  support = ctypes.CDLL("/System/Library/PrivateFrameworks/MTLCompiler.framework/MTLCompiler")
103
107
  support.MTLCodeGenServiceCreate.restype = ctypes.c_void_p
@@ -107,7 +111,7 @@ class MetalCompiler(Compiler):
107
111
  super().__init__("compile_metal_direct")
108
112
  def __reduce__(self): return (MetalCompiler,()) # force pickle to create new instance for each multiprocessing fork
109
113
  def compile(self, src:str) -> bytes:
110
- ret: Union[Exception, bytes] = CompileError("MTLCodeGenServiceBuildRequest returned without calling the callback")
114
+ ret: Exception|bytes = CompileError("MTLCodeGenServiceBuildRequest returned without calling the callback")
111
115
  @ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_int32, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_char_p)
112
116
  def callback(blockptr, error, dataPtr, dataLen, errorMessage):
113
117
  nonlocal ret
@@ -140,7 +144,10 @@ class MetalCompiler(Compiler):
140
144
  with tempfile.NamedTemporaryFile(delete=True) as shader:
141
145
  shader.write(lib)
142
146
  shader.flush()
143
- ret = os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
147
+ proc = subprocess.Popen(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}",
148
+ stdout=subprocess.PIPE, shell=True, text=True, bufsize=1)
149
+ for line in unwrap(proc.stdout): print(line, end="")
150
+ ret = proc.wait()
144
151
  if ret: print("Disassembler Error: Make sure you have https://github.com/dougallj/applegpu cloned to tinygrad/extra/disassemblers/applegpu")
145
152
 
146
153
  class MetalProgram:
@@ -187,16 +194,16 @@ class MetalProgram:
187
194
  class MetalBuffer:
188
195
  def __init__(self, buf:Any, size:int, offset=0): self.buf, self.size, self.offset = buf, size, offset
189
196
 
190
- class MetalAllocator(LRUAllocator):
191
- def __init__(self, dev:MetalDevice):
192
- self.dev:MetalDevice = dev
193
- super().__init__()
197
+ class MetalAllocator(LRUAllocator[MetalDevice]):
194
198
  def _alloc(self, size:int, options) -> MetalBuffer:
199
+ if options.external_ptr: return MetalBuffer(objc_id(options.external_ptr), size)
200
+
195
201
  # Buffer is explicitly released in _free() rather than garbage collected via reference count
196
202
  ret = msg("newBufferWithLength:options:", objc_id)(self.dev.sysdevice, ctypes.c_ulong(size), MTLResourceOptions.MTLResourceStorageModeShared)
197
203
  if ret.value is None: raise MemoryError(f"Metal OOM while allocating {size=}")
198
204
  return MetalBuffer(ret, size)
199
- def _free(self, opaque:MetalBuffer, options): msg("release")(opaque.buf)
205
+ def _free(self, opaque:MetalBuffer, options):
206
+ if msg is not None and libobjc is not None: msg("release")(opaque.buf)
200
207
  def _transfer(self, dest:MetalBuffer, src:MetalBuffer, sz:int, src_dev:MetalDevice, dest_dev:MetalDevice):
201
208
  dest_dev.synchronize()
202
209
  src_command_buffer = msg("commandBuffer", objc_instance)(src_dev.mtl_queue)
@@ -214,11 +221,14 @@ class MetalAllocator(LRUAllocator):
214
221
  msg("setLabel:")(src_command_buffer, to_ns_str(f"COPY {src_dev.device} -> {dest_dev.device}"))
215
222
  msg("commit")(src_command_buffer)
216
223
  src_dev.mtl_buffers_in_flight.append(src_command_buffer)
224
+ # Transfers currently synchronize the completion. Otherwise, copies can sometimes lead to incorrect values.
225
+ # There is no real metal multidevice support for now, so transfer is used only for tests.
226
+ src_dev.synchronize()
217
227
  def _cp_mv(self, dst, src, prof_desc):
218
228
  with cpu_profile(prof_desc, self.dev.device, is_copy=True): dst[:] = src
219
229
  def _as_buffer(self, src:MetalBuffer) -> memoryview:
220
230
  self.dev.synchronize()
221
231
  return to_mv(cast(int, msg("contents", objc_id)(src.buf).value), src.size + src.offset)[src.offset:]
222
- def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "CPU -> METAL")
223
- def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> CPU")
232
+ def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "TINY -> METAL")
233
+ def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> TINY")
224
234
  def _offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset)
@@ -2,8 +2,10 @@ import numpy as np
2
2
  from tinygrad.helpers import flat_mv
3
3
  from tinygrad.device import Compiled, Allocator
4
4
 
5
- class NpyAllocator(Allocator):
6
- def _copyout(self, dest:memoryview, src:np.ndarray): dest[:] = flat_mv(np.require(src, requirements='C').data)
5
+ class NpyAllocator(Allocator['NpyDevice']):
6
+ def _alloc(self, size:int, options=None) -> np.ndarray: return np.empty(size, dtype=np.uint8)
7
+ def _as_buffer(self, src:np.ndarray) -> memoryview: return flat_mv(np.require(src, requirements='C').data)
8
+ def _copyout(self, dest:memoryview, src:np.ndarray): dest[:] = self._as_buffer(src)
7
9
 
8
10
  class NpyDevice(Compiled):
9
- def __init__(self, device:str): super().__init__(device, NpyAllocator(), None, None, None)
11
+ def __init__(self, device:str): super().__init__(device, NpyAllocator(self), None, None, None)
@@ -0,0 +1,28 @@
1
+ from tinygrad.device import Compiled, Compiler, Allocator
2
+ from tinygrad.engine.jit import MultiGraphRunner
3
+ from tinygrad.renderer.cstyle import CStyleLanguage
4
+ from tinygrad.uop.ops import Ops
5
+
6
+ class NullRenderer(CStyleLanguage):
7
+ device = "NULL"
8
+ has_local = False
9
+ float4 = "float4"
10
+ code_for_op = {**CStyleLanguage.code_for_op, Ops.THREEFRY: lambda a,b,dtype: f"threefry({a},{b})", Ops.MAX: lambda a,b,dtype: f"max({a},{b})"}
11
+
12
+ class NullProgram:
13
+ def __init__(self, name:str, lib:bytes): pass
14
+ def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
15
+ return 1e-4
16
+
17
+ class NullAllocator(Allocator['NullDevice']):
18
+ def _alloc(self, size, options): pass
19
+ def _copyin(self, dest, src:memoryview): pass
20
+ def _copyout(self, dest:memoryview, src): pass
21
+ def _transfer(self, dest, src, sz:int, src_dev, dest_dev): pass
22
+ def _offset(self, buf, offset:int, size:int): pass
23
+
24
+ class NullGraph(MultiGraphRunner):
25
+ def __call__(self, input_rawbuffers, var_vals, wait=False) -> float|None: return 1e-3
26
+
27
+ class NullDevice(Compiled):
28
+ def __init__(self, device:str): super().__init__(device, NullAllocator(self), NullRenderer(), Compiler(), NullProgram, NullGraph)