tinygrad-0.8.0-py3-none-any.whl → tinygrad-0.9.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Files changed (74)
  1. tinygrad/__init__.py +6 -6
  2. tinygrad/codegen/__init__.py +0 -0
  3. tinygrad/codegen/kernel.py +253 -225
  4. tinygrad/codegen/linearizer.py +398 -436
  5. tinygrad/codegen/uops.py +451 -0
  6. tinygrad/device.py +268 -274
  7. tinygrad/dtype.py +56 -40
  8. tinygrad/engine/__init__.py +0 -0
  9. tinygrad/engine/graph.py +100 -0
  10. tinygrad/engine/jit.py +198 -0
  11. tinygrad/engine/realize.py +192 -0
  12. tinygrad/engine/schedule.py +370 -0
  13. tinygrad/engine/search.py +199 -0
  14. tinygrad/{mlops.py → function.py} +40 -32
  15. tinygrad/helpers.py +144 -46
  16. tinygrad/lazy.py +143 -242
  17. tinygrad/multi.py +173 -0
  18. tinygrad/nn/__init__.py +180 -9
  19. tinygrad/nn/datasets.py +8 -0
  20. tinygrad/nn/optim.py +106 -28
  21. tinygrad/nn/state.py +87 -19
  22. tinygrad/ops.py +104 -45
  23. tinygrad/renderer/__init__.py +65 -0
  24. tinygrad/renderer/assembly.py +269 -0
  25. tinygrad/renderer/cstyle.py +308 -210
  26. tinygrad/renderer/llvmir.py +119 -124
  27. tinygrad/runtime/__init__.py +0 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +13403 -0
  29. tinygrad/runtime/autogen/comgr.py +891 -0
  30. tinygrad/runtime/autogen/cuda.py +5923 -0
  31. tinygrad/runtime/autogen/hip.py +5909 -0
  32. tinygrad/runtime/autogen/hsa.py +5893 -0
  33. tinygrad/runtime/autogen/io_uring.py +1486 -0
  34. tinygrad/runtime/autogen/kfd.py +812 -0
  35. tinygrad/runtime/autogen/nv_gpu.py +33597 -0
  36. tinygrad/runtime/autogen/opencl.py +1795 -0
  37. tinygrad/runtime/driver/__init__.py +0 -0
  38. tinygrad/runtime/driver/hip_comgr.py +56 -0
  39. tinygrad/runtime/graph/__init__.py +0 -0
  40. tinygrad/runtime/graph/clang.py +39 -0
  41. tinygrad/runtime/graph/cuda.py +59 -54
  42. tinygrad/runtime/graph/hcq.py +187 -0
  43. tinygrad/runtime/graph/metal.py +37 -41
  44. tinygrad/runtime/ops_amd.py +550 -0
  45. tinygrad/runtime/ops_clang.py +16 -14
  46. tinygrad/runtime/ops_cuda.py +129 -37
  47. tinygrad/runtime/ops_disk.py +111 -43
  48. tinygrad/runtime/ops_gpu.py +52 -50
  49. tinygrad/runtime/ops_llvm.py +36 -56
  50. tinygrad/runtime/ops_metal.py +41 -24
  51. tinygrad/runtime/ops_npy.py +9 -0
  52. tinygrad/runtime/ops_nv.py +625 -0
  53. tinygrad/runtime/ops_python.py +208 -0
  54. tinygrad/shape/__init__.py +0 -0
  55. tinygrad/shape/shapetracker.py +46 -107
  56. tinygrad/shape/symbolic.py +99 -98
  57. tinygrad/shape/view.py +162 -45
  58. tinygrad/tensor.py +2492 -483
  59. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +1 -1
  60. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +31 -13
  61. tinygrad-0.9.1.dist-info/RECORD +63 -0
  62. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  63. tinygrad/features/image.py +0 -93
  64. tinygrad/features/multi.py +0 -103
  65. tinygrad/features/search.py +0 -160
  66. tinygrad/graph.py +0 -106
  67. tinygrad/jit.py +0 -152
  68. tinygrad/realize.py +0 -50
  69. tinygrad/runtime/graph/hip.py +0 -24
  70. tinygrad/runtime/ops_cpu.py +0 -45
  71. tinygrad/runtime/ops_hip.py +0 -97
  72. tinygrad/runtime/ops_torch.py +0 -49
  73. tinygrad-0.8.0.dist-info/RECORD +0 -41
  74. {tinygrad-0.8.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
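
Most of the layout churn in this list is a reorganization rather than a removal: the old top-level jit.py, realize.py, and graph.py move under tinygrad/engine/, features/multi.py and features/search.py become tinygrad/multi.py and tinygrad/engine/search.py, and mlops.py is renamed to function.py. A rough sketch of what that means for downstream imports, assuming the top-level re-exports in tinygrad/__init__.py are used (the JIT example itself is illustrative, not taken from the package):

    # Module moves inferred from the file list above:
    #   tinygrad.jit             -> tinygrad.engine.jit
    #   tinygrad.realize         -> tinygrad.engine.realize
    #   tinygrad.features.search -> tinygrad.engine.search
    #   tinygrad.features.multi  -> tinygrad.multi
    #   tinygrad.mlops           -> tinygrad.function
    from tinygrad import Tensor, TinyJit  # top-level re-exports avoid depending on the moved paths

    @TinyJit
    def double(x: Tensor) -> Tensor:
      return (x * 2).realize()

    print(double(Tensor([1.0, 2.0])).numpy())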
tinygrad/runtime/ops_llvm.py
@@ -1,66 +1,46 @@
- import ctypes
- from typing import ClassVar, Tuple
- from tinygrad.device import Compiled, MallocAllocator
- from tinygrad.helpers import getenv, DEBUG, cpu_time_execution
- from ctypes import CFUNCTYPE
- from tinygrad.codegen.kernel import LinearizerOptions
- from tinygrad.renderer.llvmir import uops_to_llvm_ir
-
+ from __future__ import annotations
+ import ctypes, functools
+ from typing import Tuple
+ from tinygrad.device import Compiled, Compiler, MallocAllocator
+ from tinygrad.helpers import DEBUG, cpu_time_execution, cpu_objdump
+ from tinygrad.renderer.llvmir import LLVMRenderer
  import llvmlite.binding as llvm

- LLVMOPT = bool(getenv("LLVMOPT"))
+ class LLVMCompiler(Compiler):
+   def __init__(self, device:LLVMDevice):
+     self.device = device
+     super().__init__("compile_llvm")
+   def compile(self, src:str) -> bytes:
+     mod = llvm.parse_assembly(src)
+     mod.verify()
+     self.device.optimizer.run(mod)
+     if DEBUG >= 5: print(self.device.target_machine.emit_assembly(mod))
+     return self.device.target_machine.emit_object(mod)

- class LLVM:
-   target_machine: ClassVar[llvm.targets.TargetMachine] = None
-   engine: ClassVar[llvm.executionengine.ExecutionEngine] = None
-   optimizer: ClassVar[llvm.passmanagers.ModulePassManager] = None
+ class LLVMProgram:
+   def __init__(self, device:LLVMDevice, name:str, lib:bytes):
+     if DEBUG >= 6: cpu_objdump(lib)
+     self.name, self.lib = name, lib
+     device.engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib))
+     self.fxn = device.engine.get_function_address(name)

-   def __init__(self):
-     if LLVM.engine is not None: return
+   def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
+     if not hasattr(self, 'cfunc'):
+       self.cfunc = ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
+     return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)
+
+ class LLVMDevice(Compiled):
+   def __init__(self, device:str):
      llvm.initialize()
      llvm.initialize_native_target()
      llvm.initialize_native_asmprinter()
      llvm.initialize_native_asmparser()
-     target = llvm.Target.from_triple(llvm.get_process_triple())
-     LLVM.optimizer = llvm.create_module_pass_manager()
-     LLVM.target_machine = target.create_target_machine(opt=2) # this opt actually can change things. ex: opt=3 means no FMA, opt=2 means FMA
-     LLVM.target_machine.add_analysis_passes(LLVM.optimizer)
-
-     # TODO: this makes compile times so much faster
-     if LLVMOPT:
-       llvm.set_option(str(), '-force-vector-interleave=4') # this makes sum the same speed as torch, it also doubles the (slow) conv speed
-       if DEBUG >= 4: llvm.set_option(str(), '--debug-only=loop-vectorize')
-       #llvm.set_option(str(), '--debug')
-
-       # does this do anything?
-       builder = llvm.create_pass_manager_builder()
-       builder.opt_level = 3
-       builder.size_level = 0
-       builder.loop_vectorize = True
-       builder.slp_vectorize = True
-       builder.populate(LLVM.optimizer)
-
-     LLVM.target_machine.set_asm_verbosity(True)
+     self.optimizer: llvm.passmanagers.ModulePassManager = llvm.create_module_pass_manager()
+     # this opt actually can change things. ex: opt=3 means no FMA, opt=2 means FMA
+     self.target_machine: llvm.targets.TargetMachine = llvm.Target.from_triple(llvm.get_process_triple()).create_target_machine(opt=2)
+     self.target_machine.add_analysis_passes(self.optimizer)
+     self.target_machine.set_asm_verbosity(True)
      backing_mod = llvm.parse_assembly(str())
      backing_mod.triple = llvm.get_process_triple()
-     LLVM.engine = llvm.create_mcjit_compiler(backing_mod, LLVM.target_machine)
-
- def compile_llvm(prg) -> bytes:
-   mod = llvm.parse_assembly(prg)
-   mod.verify()
-   LLVM().optimizer.run(mod)
-   if DEBUG >= 5: print(LLVM.target_machine.emit_assembly(mod))
-   return LLVM.target_machine.emit_object(mod)
-
- class LLVMProgram:
-   def __init__(self, name:str, lib:bytes):
-     self.name, self.lib = name, lib
-     LLVM().engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib))
-     self.fxn = LLVM.engine.get_function_address(name)
-
-   def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
-     self.cfunc = CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
-     return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)
-
- LLVMDevice = Compiled(MallocAllocator, LinearizerOptions(supports_float4=False, has_local=False, has_shared=False),
-                       uops_to_llvm_ir, compile_llvm, LLVMProgram)
+     self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine)
+     super().__init__(device, MallocAllocator, LLVMRenderer(), LLVMCompiler(self), functools.partial(LLVMProgram, self))
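
The net effect of this rewrite is that the LLVM backend's per-process state (target machine, pass manager, MCJIT engine) moves from the LLVM singleton into an LLVMDevice instance, and compilation becomes a Compiler subclass instead of the free compile_llvm function. A minimal sketch of driving the new classes directly, assuming llvmlite is installed; the IR string is a placeholder kernel, not something the package generates:

    from tinygrad.runtime.ops_llvm import LLVMDevice, LLVMCompiler, LLVMProgram

    dev = LLVMDevice("LLVM")              # builds the target machine, pass manager, and MCJIT engine
    src = "define i32 @noop() {\n  ret i32 0\n}\n"
    lib = LLVMCompiler(dev).compile(src)  # parse_assembly -> verify -> optimize -> emit_object
    prog = LLVMProgram(dev, "noop", lib)  # links the object file and resolves the symbol address
    print(prog(wait=True))                # with wait=True, returns the measured execution time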
tinygrad/runtime/ops_metal.py
@@ -1,21 +1,30 @@
  from __future__ import annotations
  import os, subprocess, pathlib, ctypes, tempfile, functools
  import Metal, libdispatch
- from typing import List, Any, Tuple, Optional
- from tinygrad.codegen.kernel import LinearizerOptions
+ from typing import List, Set, Any, Tuple, Optional
  from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
- from tinygrad.device import Compiled, LRUAllocator
+ from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator
  from tinygrad.renderer.cstyle import MetalRenderer

- def compile_metal(prg, use_xcode=bool(getenv("METAL_XCODE"))) -> bytes:
-   assert MetalDevice.compiler_device, "metal device creation is required for metal compile"
-   if use_xcode:
-     # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
-     air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
-     return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
-   options = Metal.MTLCompileOptions.new()
-   library = unwrap2(MetalDevice.compiler_device.newLibraryWithSource_options_error_(prg, options, None))
-   return library.libraryDataContents().bytes().tobytes()
+ def wait_check(cbuf: Any):
+   cbuf.waitUntilCompleted()
+   if (error := cbuf.error()) is not None:
+     raise RuntimeError(error)
+
+ class MetalCompiler(Compiler):
+   def __init__(self, device:Optional[MetalDevice]):
+     self.device = device
+     super().__init__("compile_metal")
+   def compile(self, src:str) -> bytes:
+     if self.device is None:
+       # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
+       air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8'))
+       return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
+     options = Metal.MTLCompileOptions.new()
+     options.setFastMathEnabled_(getenv("METAL_FAST_MATH"))
+     try: library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None))
+     except AssertionError as e: raise CompileError(e) from e
+     return library.libraryDataContents().bytes().tobytes()

  class MetalProgram:
    def __init__(self, device:MetalDevice, name:str, lib:bytes):
@@ -24,14 +33,15 @@ class MetalProgram:
        with tempfile.NamedTemporaryFile(delete=True) as shader:
          shader.write(lib)
          shader.flush()
-         os.system(f"cd {pathlib.Path(__file__).parents[2]}/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+         os.system(f"cd {pathlib.Path(__file__).parents[2]}/extra/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
+     assert lib[:4] == b"MTLB", "Invalid Metal library. Could be due to using conda. Try system python or METAL_XCODE=1 DISABLE_COMPILER_CACHE=1."
      data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
      self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
      self.fxn = self.library.newFunctionWithName_(name)
      self.pipeline_state = unwrap2(self.device.device.newComputePipelineStateWithFunction_error_(self.fxn, None))

-   def __call__(self, *bufs, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], vals:Tuple[int, ...]=(), wait=False):
-     assert prod(local_size) <= self.pipeline_state.maxTotalThreadsPerThreadgroup(),f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}" # noqa: E501
+   def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+     if prod(local_size) > self.pipeline_state.maxTotalThreadsPerThreadgroup(): raise RuntimeError(f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}") # noqa: E501
      command_buffer = self.device.mtl_queue.commandBuffer()
      encoder = command_buffer.computeCommandEncoder()
      encoder.setComputePipelineState_(self.pipeline_state)
@@ -41,19 +51,26 @@ class MetalProgram:
      encoder.endEncoding()
      command_buffer.commit()
      if wait:
-       command_buffer.waitUntilCompleted()
+       wait_check(command_buffer)
        return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
      self.device.mtl_buffers_in_flight.append(command_buffer)

  class MetalAllocator(LRUAllocator):
    def __init__(self, device:MetalDevice):
      self.device:MetalDevice = device
+     self.track_cross_device: Set[MetalDevice] = set()
      super().__init__()
-   def _alloc(self, size:int) -> Any:
+   def free_cache(self):
+     self.device.synchronize()
+     for x in self.track_cross_device: x.synchronize()
+     self.track_cross_device.clear()
+     return super().free_cache()
+   def _alloc(self, size:int, options) -> Any:
      ret = self.device.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
      if ret is None: raise MemoryError(f"Metal OOM while allocating {size=}")
      return ret
-   def transfer(self, dest:Any, src:Any, sz:int):
+   def transfer(self, dest:Any, src:Any, sz:int, src_dev: MetalDevice, **kwargs):
+     src_dev.synchronize()
      command_buffer = self.device.mtl_queue.commandBuffer()
      encoder = command_buffer.blitCommandEncoder()
      encoder.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size_(src, 0, dest, 0, sz)
@@ -64,7 +81,7 @@ class MetalAllocator(LRUAllocator):
      ret = self.device.device.newBufferWithBytesNoCopy_length_options_deallocator_(src, len(src), Metal.MTLResourceStorageModeShared, None)
      if ret: self.device.mv_in_metal.append(src)
      return ret
-   def _free(self, opaque:Any): opaque.release()
+   def _free(self, opaque:Any, options): opaque.release()
    def as_buffer(self, src:Any) -> memoryview:
      self.device.synchronize()
      return src.contents().as_buffer(src.length())
@@ -72,17 +89,17 @@ class MetalAllocator(LRUAllocator):
    def copyout(self, dest:memoryview, src:Any): dest[:] = self.as_buffer(src)

  class MetalDevice(Compiled):
-   compiler_device = None
    def __init__(self, device:str):
      self.device = Metal.MTLCreateSystemDefaultDevice()
-     if MetalDevice.compiler_device is None: MetalDevice.compiler_device = self.device
      self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
      self.mtl_buffers_in_flight: List[Any] = []
      self.mv_in_metal: List[memoryview] = []
+     self.track_cross_buffer: List[Any] = []
      from tinygrad.runtime.graph.metal import MetalGraph
-     super().__init__(MetalAllocator(self), LinearizerOptions(device="METAL"), MetalRenderer,
-                      compile_metal, functools.partial(MetalProgram, self), functools.partial(MetalGraph, self))
+     super().__init__(device, MetalAllocator(self), MetalRenderer(), MetalCompiler(None if getenv("METAL_XCODE") else self),
+                      functools.partial(MetalProgram, self), MetalGraph)
    def synchronize(self):
-     for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
+     for cbuf in self.mtl_buffers_in_flight: wait_check(cbuf)
      self.mv_in_metal.clear()
      self.mtl_buffers_in_flight.clear()
+     self.track_cross_buffer.clear()
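
As with the LLVM backend, Metal compilation moves behind a Compiler subclass: MetalCompiler(None) shells out to xcrun (the old METAL_XCODE path), while MetalCompiler(device) uses the in-process MTLCompileOptions route and now surfaces failures as CompileError instead of a bare AssertionError. None of this changes how a user selects the backend; a hedged sanity check, assuming macOS with the Metal bindings installed:

    # Run as: METAL=1 python script.py   (METAL_XCODE=1 additionally routes compiles through xcrun)
    from tinygrad import Tensor

    x = Tensor.rand(64, 64)
    print((x @ x.T).relu().sum().numpy())  # realizes the kernel graph on the selected backend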
tinygrad/runtime/ops_npy.py (new file)
@@ -0,0 +1,9 @@
+ import numpy as np
+ from tinygrad.helpers import flat_mv
+ from tinygrad.device import Compiled, Allocator
+
+ class NpyAllocator(Allocator): # pylint: disable=abstract-method
+   def copyout(self, dest:memoryview, src:np.ndarray): dest[:] = flat_mv(np.require(src, requirements='C').data)
+
+ class NpyDevice(Compiled):
+   def __init__(self, device:str): super().__init__(device, NpyAllocator(), None, None, None)
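
ops_npy.py is a new, minimal copy-out-only device: per the diff it implements nothing but copyout, which exposes a C-contiguous view of an ndarray to whatever is copying from it. A hedged illustration of the ordinary path that touches it, creating a Tensor from numpy data (the exact internal routing through the NPY device is an assumption, not shown in this diff):

    import numpy as np
    from tinygrad import Tensor

    a = np.arange(6, dtype=np.float32).reshape(2, 3)
    t = Tensor(a)           # numpy-backed data is copied onto the default device when realized
    print((t + 1).numpy())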