tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +1 -1
- tinygrad/apps/llm.py +206 -0
- tinygrad/codegen/__init__.py +116 -0
- tinygrad/codegen/devectorizer.py +315 -172
- tinygrad/codegen/expander.py +8 -16
- tinygrad/codegen/gpudims.py +89 -0
- tinygrad/codegen/linearize.py +205 -203
- tinygrad/codegen/lowerer.py +92 -139
- tinygrad/codegen/opt/__init__.py +38 -0
- tinygrad/codegen/opt/heuristic.py +125 -0
- tinygrad/codegen/opt/kernel.py +510 -0
- tinygrad/{engine → codegen/opt}/search.py +51 -35
- tinygrad/codegen/opt/swizzler.py +134 -0
- tinygrad/codegen/opt/tc.py +127 -0
- tinygrad/codegen/quantize.py +67 -0
- tinygrad/device.py +122 -132
- tinygrad/dtype.py +152 -35
- tinygrad/engine/jit.py +81 -54
- tinygrad/engine/memory.py +46 -27
- tinygrad/engine/realize.py +82 -41
- tinygrad/engine/schedule.py +70 -445
- tinygrad/frontend/__init__.py +0 -0
- tinygrad/frontend/onnx.py +1253 -0
- tinygrad/frontend/torch.py +5 -0
- tinygrad/gradient.py +19 -27
- tinygrad/helpers.py +95 -47
- tinygrad/nn/__init__.py +7 -8
- tinygrad/nn/optim.py +72 -41
- tinygrad/nn/state.py +37 -23
- tinygrad/renderer/__init__.py +40 -60
- tinygrad/renderer/cstyle.py +143 -128
- tinygrad/renderer/llvmir.py +113 -62
- tinygrad/renderer/ptx.py +50 -32
- tinygrad/renderer/wgsl.py +27 -23
- tinygrad/runtime/autogen/am/am.py +5861 -0
- tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
- tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
- tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
- tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
- tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
- tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
- tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
- tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
- tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
- tinygrad/runtime/autogen/comgr.py +35 -9
- tinygrad/runtime/autogen/comgr_3.py +906 -0
- tinygrad/runtime/autogen/cuda.py +2419 -494
- tinygrad/runtime/autogen/hsa.py +57 -16
- tinygrad/runtime/autogen/ib.py +7171 -0
- tinygrad/runtime/autogen/io_uring.py +917 -118
- tinygrad/runtime/autogen/kfd.py +748 -26
- tinygrad/runtime/autogen/libc.py +613 -218
- tinygrad/runtime/autogen/libusb.py +1643 -0
- tinygrad/runtime/autogen/nv/nv.py +8602 -0
- tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
- tinygrad/runtime/autogen/opencl.py +2 -4
- tinygrad/runtime/autogen/sqtt.py +1789 -0
- tinygrad/runtime/autogen/vfio.py +3 -3
- tinygrad/runtime/autogen/webgpu.py +273 -264
- tinygrad/runtime/graph/cuda.py +3 -3
- tinygrad/runtime/graph/hcq.py +68 -29
- tinygrad/runtime/graph/metal.py +29 -13
- tinygrad/runtime/graph/remote.py +114 -0
- tinygrad/runtime/ops_amd.py +537 -320
- tinygrad/runtime/ops_cpu.py +108 -7
- tinygrad/runtime/ops_cuda.py +12 -14
- tinygrad/runtime/ops_disk.py +13 -10
- tinygrad/runtime/ops_dsp.py +47 -40
- tinygrad/runtime/ops_gpu.py +13 -11
- tinygrad/runtime/ops_hip.py +6 -9
- tinygrad/runtime/ops_llvm.py +35 -15
- tinygrad/runtime/ops_metal.py +29 -19
- tinygrad/runtime/ops_npy.py +5 -3
- tinygrad/runtime/ops_null.py +28 -0
- tinygrad/runtime/ops_nv.py +306 -234
- tinygrad/runtime/ops_python.py +62 -52
- tinygrad/runtime/ops_qcom.py +28 -39
- tinygrad/runtime/ops_remote.py +482 -0
- tinygrad/runtime/ops_webgpu.py +28 -28
- tinygrad/runtime/support/am/amdev.py +114 -249
- tinygrad/runtime/support/am/ip.py +211 -172
- tinygrad/runtime/support/amd.py +138 -0
- tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
- tinygrad/runtime/support/compiler_cuda.py +8 -11
- tinygrad/runtime/support/elf.py +2 -1
- tinygrad/runtime/support/hcq.py +184 -97
- tinygrad/runtime/support/ib.py +172 -0
- tinygrad/runtime/support/llvm.py +3 -4
- tinygrad/runtime/support/memory.py +251 -0
- tinygrad/runtime/support/nv/__init__.py +0 -0
- tinygrad/runtime/support/nv/ip.py +581 -0
- tinygrad/runtime/support/nv/nvdev.py +183 -0
- tinygrad/runtime/support/system.py +170 -0
- tinygrad/runtime/support/usb.py +268 -0
- tinygrad/runtime/support/webgpu.py +18 -0
- tinygrad/schedule/__init__.py +0 -0
- tinygrad/schedule/grouper.py +119 -0
- tinygrad/schedule/kernelize.py +368 -0
- tinygrad/schedule/multi.py +231 -0
- tinygrad/shape/shapetracker.py +40 -46
- tinygrad/shape/view.py +88 -52
- tinygrad/tensor.py +968 -542
- tinygrad/uop/__init__.py +117 -0
- tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
- tinygrad/uop/mathtraits.py +169 -0
- tinygrad/uop/ops.py +1021 -0
- tinygrad/uop/spec.py +228 -0
- tinygrad/{codegen → uop}/symbolic.py +239 -216
- tinygrad/uop/upat.py +163 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
- tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
- tinygrad/viz/index.html +203 -403
- tinygrad/viz/js/index.js +718 -0
- tinygrad/viz/js/worker.js +29 -0
- tinygrad/viz/serve.py +224 -102
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
- tinygrad-0.11.0.dist-info/RECORD +141 -0
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/kernel.py +0 -693
- tinygrad/engine/multi.py +0 -161
- tinygrad/ops.py +0 -1003
- tinygrad/runtime/ops_cloud.py +0 -220
- tinygrad/runtime/support/allocator.py +0 -94
- tinygrad/spec.py +0 -155
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
- tinygrad/viz/perfetto.html +0 -178
- tinygrad-0.10.2.dist-info/RECORD +0 -99
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
- {tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_metal.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
import
|
2
|
-
from typing import Any,
|
3
|
-
from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t, PROFILE
|
4
|
-
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator,
|
1
|
+
import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform
|
2
|
+
from typing import Any, cast
|
3
|
+
from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, T, init_c_struct_t, PROFILE, ProfileRangeEvent, cpu_profile, unwrap
|
4
|
+
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent
|
5
5
|
from tinygrad.renderer.cstyle import MetalRenderer
|
6
6
|
|
7
7
|
class objc_id(ctypes.c_void_p): # This prevents ctypes from converting response to plain int, and dict.fromkeys() can use it to dedup
|
@@ -35,7 +35,7 @@ libobjc.sel_registerName.restype = objc_id
|
|
35
35
|
libmetal.MTLCreateSystemDefaultDevice.restype = objc_instance
|
36
36
|
libdispatch.dispatch_data_create.restype = objc_instance
|
37
37
|
|
38
|
-
@functools.
|
38
|
+
@functools.cache
|
39
39
|
def msg(selector: str, restype: type[T] = objc_id): # type: ignore [assignment]
|
40
40
|
resname = libobjc.sel_registerName(selector.encode())
|
41
41
|
sender = libobjc["objc_msgSend"] # Using attribute access returns a new reference so setting restype is safe
|
@@ -43,11 +43,12 @@ def msg(selector: str, restype: type[T] = objc_id): # type: ignore [assignment]
|
|
43
43
|
def _msg(ptr: objc_id, *args: Any) -> T: return sender(ptr, resname, *args)
|
44
44
|
return _msg
|
45
45
|
|
46
|
-
@functools.
|
46
|
+
@functools.cache
|
47
47
|
def to_ns_str(s: str): return msg("stringWithUTF8String:", objc_instance)(libobjc.objc_getClass(b"NSString"), s.encode())
|
48
48
|
def from_ns_str(s): return bytes(msg("UTF8String", ctypes.c_char_p)(s)).decode()
|
49
49
|
|
50
|
-
def to_struct(*t: int, _type: type = ctypes.c_ulong):
|
50
|
+
def to_struct(*t: int, _type: type[ctypes._SimpleCData] = ctypes.c_ulong):
|
51
|
+
return init_c_struct_t(tuple([(f"field{i}", _type) for i in range(len(t))]))(*t)
|
51
52
|
|
52
53
|
def wait_check(cbuf: Any):
|
53
54
|
msg("waitUntilCompleted")(cbuf)
|
@@ -73,14 +74,17 @@ class MetalDevice(Compiled):
|
|
73
74
|
Compiled.profile_events += [ProfileDeviceEvent(device)]
|
74
75
|
|
75
76
|
from tinygrad.runtime.graph.metal import MetalGraph
|
77
|
+
# NOTE: GitHub CI macOS runners use paravirtualized metal which is broken with graph.
|
78
|
+
# This can be reproduced locally with any virtualization software (like utm) that can create macOS VMs with apple's own virtualization framework.
|
76
79
|
super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler() if getenv("METAL_DIRECT", 1) else Compiler(),
|
77
|
-
functools.partial(MetalProgram, self), MetalGraph)
|
80
|
+
functools.partial(MetalProgram, self), MetalGraph if 'virtual' not in from_ns_str(msg('name')(self.sysdevice)).lower() else None)
|
78
81
|
|
79
82
|
def synchronize(self):
|
80
83
|
for cbuf in self.mtl_buffers_in_flight:
|
81
84
|
wait_check(cbuf)
|
82
85
|
st, en = decimal.Decimal(cmdbuf_st_time(cbuf)) * 1000000, decimal.Decimal(cmdbuf_en_time(cbuf)) * 1000000
|
83
|
-
|
86
|
+
# NOTE: command buffers from MetalGraph are not profiled here
|
87
|
+
if PROFILE and (lb:=cmdbuf_label(cbuf)) is not None and not lb.startswith("batched"):
|
84
88
|
Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en, is_copy=lb.startswith("COPY"))]
|
85
89
|
self.mtl_buffers_in_flight.clear()
|
86
90
|
|
@@ -97,7 +101,7 @@ class MetalCompiler(Compiler):
|
|
97
101
|
# This means that MTLCompiler's llvm will create it's own instances of global state because RTLD_LOCAL doesn't export symbols, but if RTLD_GLOBAL
|
98
102
|
# library is loaded first then RTLD_LOCAL library will just use it's symbols. On linux there is RTLD_DEEPBIND to prevent that, but on macos there
|
99
103
|
# doesn't seem to be anything we can do.
|
100
|
-
with contextlib.suppress(FileNotFoundError):
|
104
|
+
with contextlib.suppress(FileNotFoundError, ModuleNotFoundError):
|
101
105
|
import tinygrad.runtime.autogen.llvm # noqa: F401
|
102
106
|
support = ctypes.CDLL("/System/Library/PrivateFrameworks/MTLCompiler.framework/MTLCompiler")
|
103
107
|
support.MTLCodeGenServiceCreate.restype = ctypes.c_void_p
|
@@ -107,7 +111,7 @@ class MetalCompiler(Compiler):
|
|
107
111
|
super().__init__("compile_metal_direct")
|
108
112
|
def __reduce__(self): return (MetalCompiler,()) # force pickle to create new instance for each multiprocessing fork
|
109
113
|
def compile(self, src:str) -> bytes:
|
110
|
-
ret:
|
114
|
+
ret: Exception|bytes = CompileError("MTLCodeGenServiceBuildRequest returned without calling the callback")
|
111
115
|
@ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_int32, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_char_p)
|
112
116
|
def callback(blockptr, error, dataPtr, dataLen, errorMessage):
|
113
117
|
nonlocal ret
|
@@ -140,7 +144,10 @@ class MetalCompiler(Compiler):
|
|
140
144
|
with tempfile.NamedTemporaryFile(delete=True) as shader:
|
141
145
|
shader.write(lib)
|
142
146
|
shader.flush()
|
143
|
-
|
147
|
+
proc = subprocess.Popen(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}",
|
148
|
+
stdout=subprocess.PIPE, shell=True, text=True, bufsize=1)
|
149
|
+
for line in unwrap(proc.stdout): print(line, end="")
|
150
|
+
ret = proc.wait()
|
144
151
|
if ret: print("Disassembler Error: Make sure you have https://github.com/dougallj/applegpu cloned to tinygrad/extra/disassemblers/applegpu")
|
145
152
|
|
146
153
|
class MetalProgram:
|
@@ -187,16 +194,16 @@ class MetalProgram:
|
|
187
194
|
class MetalBuffer:
|
188
195
|
def __init__(self, buf:Any, size:int, offset=0): self.buf, self.size, self.offset = buf, size, offset
|
189
196
|
|
190
|
-
class MetalAllocator(LRUAllocator):
|
191
|
-
def __init__(self, dev:MetalDevice):
|
192
|
-
self.dev:MetalDevice = dev
|
193
|
-
super().__init__()
|
197
|
+
class MetalAllocator(LRUAllocator[MetalDevice]):
|
194
198
|
def _alloc(self, size:int, options) -> MetalBuffer:
|
199
|
+
if options.external_ptr: return MetalBuffer(objc_id(options.external_ptr), size)
|
200
|
+
|
195
201
|
# Buffer is explicitly released in _free() rather than garbage collected via reference count
|
196
202
|
ret = msg("newBufferWithLength:options:", objc_id)(self.dev.sysdevice, ctypes.c_ulong(size), MTLResourceOptions.MTLResourceStorageModeShared)
|
197
203
|
if ret.value is None: raise MemoryError(f"Metal OOM while allocating {size=}")
|
198
204
|
return MetalBuffer(ret, size)
|
199
|
-
def _free(self, opaque:MetalBuffer, options):
|
205
|
+
def _free(self, opaque:MetalBuffer, options):
|
206
|
+
if msg is not None and libobjc is not None: msg("release")(opaque.buf)
|
200
207
|
def _transfer(self, dest:MetalBuffer, src:MetalBuffer, sz:int, src_dev:MetalDevice, dest_dev:MetalDevice):
|
201
208
|
dest_dev.synchronize()
|
202
209
|
src_command_buffer = msg("commandBuffer", objc_instance)(src_dev.mtl_queue)
|
@@ -214,11 +221,14 @@ class MetalAllocator(LRUAllocator):
|
|
214
221
|
msg("setLabel:")(src_command_buffer, to_ns_str(f"COPY {src_dev.device} -> {dest_dev.device}"))
|
215
222
|
msg("commit")(src_command_buffer)
|
216
223
|
src_dev.mtl_buffers_in_flight.append(src_command_buffer)
|
224
|
+
# Transfers currently synchronize the completion. Otherwise, copies can sometimes lead to incorrect values.
|
225
|
+
# There is no real metal multidevice support for now, so transfer is used only for tests.
|
226
|
+
src_dev.synchronize()
|
217
227
|
def _cp_mv(self, dst, src, prof_desc):
|
218
228
|
with cpu_profile(prof_desc, self.dev.device, is_copy=True): dst[:] = src
|
219
229
|
def _as_buffer(self, src:MetalBuffer) -> memoryview:
|
220
230
|
self.dev.synchronize()
|
221
231
|
return to_mv(cast(int, msg("contents", objc_id)(src.buf).value), src.size + src.offset)[src.offset:]
|
222
|
-
def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "
|
223
|
-
def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL ->
|
232
|
+
def _copyin(self, dest:MetalBuffer, src:memoryview): self._cp_mv(self._as_buffer(dest), src, "TINY -> METAL")
|
233
|
+
def _copyout(self, dest:memoryview, src:MetalBuffer): self._cp_mv(dest, self._as_buffer(src), "METAL -> TINY")
|
224
234
|
def _offset(self, buf:MetalBuffer, size:int, offset:int): return MetalBuffer(buf.buf, size, offset)
|
tinygrad/runtime/ops_npy.py
CHANGED
@@ -2,8 +2,10 @@ import numpy as np
|
|
2
2
|
from tinygrad.helpers import flat_mv
|
3
3
|
from tinygrad.device import Compiled, Allocator
|
4
4
|
|
5
|
-
class NpyAllocator(Allocator):
|
6
|
-
def
|
5
|
+
class NpyAllocator(Allocator['NpyDevice']):
|
6
|
+
def _alloc(self, size:int, options=None) -> np.ndarray: return np.empty(size, dtype=np.uint8)
|
7
|
+
def _as_buffer(self, src:np.ndarray) -> memoryview: return flat_mv(np.require(src, requirements='C').data)
|
8
|
+
def _copyout(self, dest:memoryview, src:np.ndarray): dest[:] = self._as_buffer(src)
|
7
9
|
|
8
10
|
class NpyDevice(Compiled):
|
9
|
-
def __init__(self, device:str): super().__init__(device, NpyAllocator(), None, None, None)
|
11
|
+
def __init__(self, device:str): super().__init__(device, NpyAllocator(self), None, None, None)
|
tinygrad/runtime/ops_null.py
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
from tinygrad.device import Compiled, Compiler, Allocator
|
2
|
+
from tinygrad.engine.jit import MultiGraphRunner
|
3
|
+
from tinygrad.renderer.cstyle import CStyleLanguage
|
4
|
+
from tinygrad.uop.ops import Ops
|
5
|
+
|
6
|
+
class NullRenderer(CStyleLanguage):
|
7
|
+
device = "NULL"
|
8
|
+
has_local = False
|
9
|
+
float4 = "float4"
|
10
|
+
code_for_op = {**CStyleLanguage.code_for_op, Ops.THREEFRY: lambda a,b,dtype: f"threefry({a},{b})", Ops.MAX: lambda a,b,dtype: f"max({a},{b})"}
|
11
|
+
|
12
|
+
class NullProgram:
|
13
|
+
def __init__(self, name:str, lib:bytes): pass
|
14
|
+
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
15
|
+
return 1e-4
|
16
|
+
|
17
|
+
class NullAllocator(Allocator['NullDevice']):
|
18
|
+
def _alloc(self, size, options): pass
|
19
|
+
def _copyin(self, dest, src:memoryview): pass
|
20
|
+
def _copyout(self, dest:memoryview, src): pass
|
21
|
+
def _transfer(self, dest, src, sz:int, src_dev, dest_dev): pass
|
22
|
+
def _offset(self, buf, offset:int, size:int): pass
|
23
|
+
|
24
|
+
class NullGraph(MultiGraphRunner):
|
25
|
+
def __call__(self, input_rawbuffers, var_vals, wait=False) -> float|None: return 1e-3
|
26
|
+
|
27
|
+
class NullDevice(Compiled):
|
28
|
+
def __init__(self, device:str): super().__init__(device, NullAllocator(self), NullRenderer(), Compiler(), NullProgram, NullGraph)
|