tinygrad 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. tinygrad/__init__.py +6 -0
  2. tinygrad/codegen/kernel.py +572 -83
  3. tinygrad/codegen/linearizer.py +415 -395
  4. tinygrad/codegen/uops.py +415 -0
  5. tinygrad/device.py +183 -0
  6. tinygrad/dtype.py +113 -0
  7. tinygrad/engine/__init__.py +0 -0
  8. tinygrad/engine/graph.py +100 -0
  9. tinygrad/engine/jit.py +195 -0
  10. tinygrad/engine/realize.py +191 -0
  11. tinygrad/engine/schedule.py +362 -0
  12. tinygrad/engine/search.py +196 -0
  13. tinygrad/{mlops.py → function.py} +76 -55
  14. tinygrad/helpers.py +196 -89
  15. tinygrad/lazy.py +210 -371
  16. tinygrad/multi.py +169 -0
  17. tinygrad/nn/__init__.py +202 -22
  18. tinygrad/nn/datasets.py +7 -0
  19. tinygrad/nn/optim.py +112 -32
  20. tinygrad/nn/state.py +136 -39
  21. tinygrad/ops.py +119 -202
  22. tinygrad/renderer/__init__.py +61 -0
  23. tinygrad/renderer/assembly.py +276 -0
  24. tinygrad/renderer/cstyle.py +353 -166
  25. tinygrad/renderer/llvmir.py +150 -138
  26. tinygrad/runtime/autogen/amd_gpu.py +1900 -0
  27. tinygrad/runtime/autogen/comgr.py +865 -0
  28. tinygrad/runtime/autogen/cuda.py +5923 -0
  29. tinygrad/runtime/autogen/hip.py +5909 -0
  30. tinygrad/runtime/autogen/hsa.py +5761 -0
  31. tinygrad/runtime/autogen/kfd.py +812 -0
  32. tinygrad/runtime/autogen/nv_gpu.py +33328 -0
  33. tinygrad/runtime/autogen/opencl.py +1795 -0
  34. tinygrad/runtime/driver/hip_comgr.py +47 -0
  35. tinygrad/runtime/driver/hsa.py +143 -0
  36. tinygrad/runtime/graph/clang.py +38 -0
  37. tinygrad/runtime/graph/cuda.py +81 -0
  38. tinygrad/runtime/graph/hcq.py +143 -0
  39. tinygrad/runtime/graph/hsa.py +171 -0
  40. tinygrad/runtime/graph/metal.py +75 -0
  41. tinygrad/runtime/ops_amd.py +564 -0
  42. tinygrad/runtime/ops_clang.py +24 -77
  43. tinygrad/runtime/ops_cuda.py +175 -89
  44. tinygrad/runtime/ops_disk.py +56 -33
  45. tinygrad/runtime/ops_gpu.py +92 -95
  46. tinygrad/runtime/ops_hsa.py +278 -0
  47. tinygrad/runtime/ops_llvm.py +39 -60
  48. tinygrad/runtime/ops_metal.py +92 -74
  49. tinygrad/runtime/ops_npy.py +9 -0
  50. tinygrad/runtime/ops_nv.py +630 -0
  51. tinygrad/runtime/ops_python.py +204 -0
  52. tinygrad/shape/shapetracker.py +86 -254
  53. tinygrad/shape/symbolic.py +166 -141
  54. tinygrad/shape/view.py +296 -0
  55. tinygrad/tensor.py +2619 -448
  56. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
  57. tinygrad-0.9.0.dist-info/METADATA +227 -0
  58. tinygrad-0.9.0.dist-info/RECORD +60 -0
  59. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
  60. tinygrad/codegen/assembly.py +0 -190
  61. tinygrad/codegen/optimizer.py +0 -379
  62. tinygrad/codegen/search.py +0 -72
  63. tinygrad/graph.py +0 -83
  64. tinygrad/jit.py +0 -57
  65. tinygrad/nn/image.py +0 -100
  66. tinygrad/renderer/assembly_arm64.py +0 -169
  67. tinygrad/renderer/assembly_ptx.py +0 -98
  68. tinygrad/renderer/wgsl.py +0 -53
  69. tinygrad/runtime/lib.py +0 -113
  70. tinygrad/runtime/ops_cpu.py +0 -51
  71. tinygrad/runtime/ops_hip.py +0 -82
  72. tinygrad/runtime/ops_shm.py +0 -29
  73. tinygrad/runtime/ops_torch.py +0 -30
  74. tinygrad/runtime/ops_webgpu.py +0 -45
  75. tinygrad-0.7.0.dist-info/METADATA +0 -212
  76. tinygrad-0.7.0.dist-info/RECORD +0 -40
  77. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
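
The file list already tells the structural story: the monolithic 0.7.0 layout (codegen/optimizer.py, jit.py, runtime/lib.py, per-backend ops_cpu/ops_torch/ops_hip) gives way to an engine/ package, autogenerated C bindings under runtime/autogen/, and a shared device.py/dtype.py layer. As rough orientation only (not part of the diff), a minimal 0.9.0 usage sketch; it assumes the six-line tinygrad/__init__.py re-exports Tensor, Device, and dtypes, which should be verified against the wheel:

    # hedged smoke test, not from the diff; assumes tinygrad 0.9.0 is installed
    from tinygrad import Tensor, Device, dtypes

    x = Tensor.rand(8, 8, dtype=dtypes.float32)   # lives on Device.DEFAULT (typically METAL on macOS)
    print(Device.DEFAULT, (x @ x.T).relu().sum().item())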
--- tinygrad/runtime/ops_metal.py (0.7.0)
+++ tinygrad/runtime/ops_metal.py (0.9.0)
@@ -1,88 +1,106 @@
-# pip3 install pyobjc-framework-Metal pyobjc-framework-Cocoa pyobjc-framework-libdispatch
-import os, subprocess, pathlib, functools, ctypes
-import Metal, Cocoa, libdispatch # type: ignore
-from typing import List, Any
-from tinygrad.codegen.linearizer import LinearizerOptions
-from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
-from tinygrad.helpers import prod, getenv, DEBUG, DType, dtypes
-from tinygrad.ops import Compiled
-from tinygrad.runtime.lib import RawBufferMapped, LRUAllocator
+from __future__ import annotations
+import os, subprocess, pathlib, ctypes, tempfile, functools
+import Metal, libdispatch
+from typing import List, Set, Any, Tuple, Optional
+from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
+from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator
+from tinygrad.renderer.cstyle import MetalRenderer
 
-METAL_XCODE = getenv("METAL_XCODE")
+def wait_check(cbuf: Any):
+  cbuf.waitUntilCompleted()
+  if (error := cbuf.error()) is not None:
+    raise RuntimeError(error)
 
-class MetalAllocator(LRUAllocator):
-  def _do_alloc(self, size, dtype, device, **kwargs): return METAL.device.newBufferWithLength_options_(size*dtype.itemsize, Metal.MTLResourceStorageModeShared)
-  def _do_free(self, buf): buf.release()
-  def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
-
-class _METAL:
-  def __init__(self):
-    self.mtl_buffers_in_flight: List[Any] = []
-    self.device = Metal.MTLCreateSystemDefaultDevice()
-    self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
-    self.allocator = MetalAllocator(self.device.dedicatedMemorySize() or self.device.sharedMemorySize())
-  # TODO: is there a better way to do this?
-  def synchronize(self):
-    for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
-    self.mtl_buffers_in_flight.clear()
-METAL = _METAL()
-
-class RawMetalBuffer(RawBufferMapped):
-  def __init__(self, size:int, dtype:DType):
-    assert dtype != dtypes.double, f"METAL does not support {dtype.name}"
-    super().__init__(size, dtype, allocator=METAL.allocator)
-  def _buffer(self):
-    METAL.synchronize()
-    return self._buf.contents().as_buffer(self._buf.length())
-
-def unwrap(x):
-  ret, err = x
-  assert err is None, str(err)
-  return ret
-
-class MetalProgram:
-  def __init__(self, name:str, prg:str, binary:bool=False):
-    if METAL_XCODE:
-      air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
+class MetalCompiler(Compiler):
+  def __init__(self, device:Optional[MetalDevice]):
+    self.device = device
+    super().__init__("compile_metal")
+  def compile(self, src:str) -> bytes:
+    if self.device is None:
       # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
-      lib = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
-      data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
-      self.library = unwrap(METAL.device.newLibraryWithData_error_(data, None))
+      air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8'))
+      return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
     else:
-      options = Metal.MTLCompileOptions.alloc().init()
-      self.library = unwrap(METAL.device.newLibraryWithSource_options_error_(prg, options, None))
+      options = Metal.MTLCompileOptions.new()
+      options.setFastMathEnabled_(getenv("METAL_FAST_MATH"))
+      try: library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None))
+      except AssertionError as e: raise CompileError(e)
+      return library.libraryDataContents().bytes().tobytes()
+
+class MetalProgram:
+  def __init__(self, device:MetalDevice, name:str, lib:bytes):
+    self.device, self.name, self.lib = device, name, lib
+    if DEBUG >= 6:
+      with tempfile.NamedTemporaryFile(delete=True) as shader:
+        shader.write(lib)
+        shader.flush()
+        os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+    assert lib[:4] == b"MTLB", "Invalid Metal library. Could be due to using conda. Try system python or METAL_XCODE=1 DISABLE_COMPILER_CACHE=1."
+    data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
+    self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
     self.fxn = self.library.newFunctionWithName_(name)
-    # hacks to disassemble shader
-    if DEBUG >= 5:
-      arc = unwrap(METAL.device.newBinaryArchiveWithDescriptor_error_(Metal.MTLBinaryArchiveDescriptor.alloc().init(), None))
-      desc = Metal.MTLComputePipelineDescriptor.alloc().init()
-      desc.setComputeFunction_(self.fxn)
-      unwrap(arc.addComputePipelineFunctionsWithDescriptor_error_(desc, None))
-      unwrap(arc.serializeToURL_error_(Cocoa.NSURL.URLWithString_("file:///tmp/shader.bin"), None))
-      # clone https://github.com/dougallj/applegpu.git in tinygrad/disassemblers
-      os.system(f"cd {pathlib.Path(__file__).parent.parent.parent}/disassemblers/applegpu && python3 compiler_explorer.py /tmp/shader.bin")
-    self.pipeline_state = unwrap(METAL.device.newComputePipelineStateWithFunction_error_(self.fxn, None))
+    self.pipeline_state = unwrap2(self.device.device.newComputePipelineStateWithFunction_error_(self.fxn, None))
 
-  def __call__(self, global_size, local_size, *bufs, wait=False):
-    assert prod(local_size) <= self.pipeline_state.maxTotalThreadsPerThreadgroup(), f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}"
-    command_buffer = METAL.mtl_queue.commandBuffer()
+  def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    if prod(local_size) > self.pipeline_state.maxTotalThreadsPerThreadgroup(): raise RuntimeError(f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}")  # noqa: E501
+    command_buffer = self.device.mtl_queue.commandBuffer()
     encoder = command_buffer.computeCommandEncoder()
     encoder.setComputePipelineState_(self.pipeline_state)
-    for i,a in enumerate(bufs):
-      if isinstance(a, RawMetalBuffer): encoder.setBuffer_offset_atIndex_(a._buf, 0, i)
-      elif isinstance(a, int): encoder.setBytes_length_atIndex_((arg:=ctypes.c_int32(a)), ctypes.sizeof(arg), i)
-      else: raise RuntimeError(f"arg at index {i} has unsupported type {type(a)}")
+    for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a, 0, i)
+    for i,a in enumerate(vals,start=len(bufs)): encoder.setBytes_length_atIndex_(ctypes.c_int32(a), 4, i)
     encoder.dispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
     encoder.endEncoding()
     command_buffer.commit()
     if wait:
-      command_buffer.waitUntilCompleted()
+      wait_check(command_buffer)
      return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
-    METAL.mtl_buffers_in_flight.append(command_buffer)
+    self.device.mtl_buffers_in_flight.append(command_buffer)
+
+class MetalAllocator(LRUAllocator):
+  def __init__(self, device:MetalDevice):
+    self.device:MetalDevice = device
+    self.track_cross_device: Set[MetalDevice] = set()
+    super().__init__()
+  def free_cache(self):
+    self.device.synchronize()
+    for x in self.track_cross_device: x.synchronize()
+    self.track_cross_device.clear()
+    return super().free_cache()
+  def _alloc(self, size:int, options) -> Any:
+    ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
+    if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}")
+    return ret
+  def transfer(self, dest:Any, src:Any, sz:int, src_dev: MetalDevice, **kwargs):
+    src_dev.synchronize()
+    command_buffer = self.device.mtl_queue.commandBuffer()
+    encoder = command_buffer.blitCommandEncoder()
+    encoder.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size_(src, 0, dest, 0, sz)
+    encoder.endEncoding()
+    command_buffer.commit()
+    self.device.mtl_buffers_in_flight.append(command_buffer)
+  def from_buffer(self, src:memoryview) -> Optional[Any]:
+    ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, len(src), Metal.MTLResourceStorageModeShared, None)
+    if ret: self.device.mv_in_metal.append(src)
+    return ret
+  def _free(self, opaque:Any, options): opaque.release()
+  def as_buffer(self, src:Any) -> memoryview:
+    self.device.synchronize()
+    return src.contents().as_buffer(src.length())
+  def copyin(self, dest:Any, src:memoryview): self.as_buffer(dest)[:] = src
+  def copyout(self, dest:memoryview, src:Any): dest[:] = self.as_buffer(src)
 
-renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
-  kernel_prefix = "#include <metal_stdlib>\nusing namespace metal;\nkernel", buffer_prefix = "device ", smem_prefix = "threadgroup ", arg_int_prefix = "constant int&",
-  barrier = "threadgroup_barrier(mem_flags::mem_threadgroup);", float4 = "float4", uses_ptr_arithmetic=True,
-  gid = [f"gid.{chr(120+i)}" for i in range(3)], lid = [f"lid.{chr(120+i)}" for i in range(3)],
-  extra_args = ['uint3 gid [[threadgroup_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]']))
-MetalBuffer = Compiled(RawMetalBuffer, LinearizerOptions(), renderer, MetalProgram, METAL.synchronize)
+class MetalDevice(Compiled):
+  def __init__(self, device:str):
+    self.device = Metal.MTLCreateSystemDefaultDevice()
+    self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
+    self.mtl_buffers_in_flight: List[Any] = []
+    self.mv_in_metal: List[memoryview] = []
+    self.track_cross_buffer: List[Any] = []
+    from tinygrad.runtime.graph.metal import MetalGraph
+    super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler(None if getenv("METAL_XCODE") else self),
+                     functools.partial(MetalProgram, self), MetalGraph)
+  def synchronize(self):
+    for cbuf in self.mtl_buffers_in_flight: wait_check(cbuf)
+    self.mv_in_metal.clear()
+    self.mtl_buffers_in_flight.clear()
+    self.track_cross_buffer.clear()
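
Taken together, the rewrite splits the old _METAL singleton into a per-device MetalDevice (a Compiled), a MetalCompiler, a MetalAllocator, and a per-kernel MetalProgram. A hedged sketch of driving these pieces by hand, assuming tinygrad 0.9.0 on macOS with a Metal GPU; the kernel source and the add1 name are illustrative, not taken from the diff:

    import struct
    from tinygrad import Device, dtypes
    from tinygrad.device import Buffer

    dev = Device["METAL"]                       # constructs the MetalDevice shown above
    src = """#include <metal_stdlib>
    using namespace metal;
    kernel void add1(device float* out, device const float* a, uint gid [[thread_position_in_grid]]) {
      out[gid] = a[gid] + 1.0f;
    }"""
    lib = dev.compiler.compile(src)             # MetalCompiler -> metallib bytes (must start with b"MTLB")
    prg = dev.runtime("add1", lib)              # i.e. functools.partial(MetalProgram, dev)

    a = Buffer("METAL", 4, dtypes.float32).allocate()
    out = Buffer("METAL", 4, dtypes.float32).allocate()
    a.copyin(memoryview(struct.pack("4f", 1.0, 2.0, 3.0, 4.0)))
    prg(out._buf, a._buf, global_size=(4, 1, 1), local_size=(1, 1, 1), wait=True)
    result = bytearray(16)
    out.copyout(memoryview(result))
    print(struct.unpack("4f", result))          # expected (2.0, 3.0, 4.0, 5.0)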
--- /dev/null
+++ tinygrad/runtime/ops_npy.py (0.9.0)
@@ -0,0 +1,9 @@
+import numpy as np
+from tinygrad.helpers import flat_mv
+from tinygrad.device import Compiled, Allocator
+
+class NpyAllocator(Allocator):
+  def copyout(self, dest:memoryview, src:np.ndarray): dest[:] = flat_mv(np.require(src, requirements='C').data)
+
+class NpyDevice(Compiled):
+  def __init__(self, device:str): super().__init__(device, NpyAllocator(), None, None, None)
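
ops_npy.py is new in 0.9.0 and deliberately tiny: it gives host numpy arrays a Device/Allocator face so their bytes can be copied out into real device buffers; there is no alloc, copyin, compiler, or runtime. A minimal sketch of the allocator in isolation (assumes tinygrad 0.9.0 and numpy are installed):

    import numpy as np
    from tinygrad.runtime.ops_npy import NpyAllocator

    src = np.arange(4, dtype=np.float32)
    dest = memoryview(bytearray(src.nbytes))        # destination bytes, e.g. a staging buffer
    NpyAllocator().copyout(dest, src)               # flat_mv copies the array's raw C-contiguous bytes
    print(np.frombuffer(dest, dtype=np.float32))    # [0. 1. 2. 3.]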