tinygrad 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. tinygrad/__init__.py +6 -0
  2. tinygrad/codegen/kernel.py +572 -83
  3. tinygrad/codegen/linearizer.py +415 -395
  4. tinygrad/codegen/uops.py +415 -0
  5. tinygrad/device.py +183 -0
  6. tinygrad/dtype.py +113 -0
  7. tinygrad/engine/__init__.py +0 -0
  8. tinygrad/engine/graph.py +100 -0
  9. tinygrad/engine/jit.py +195 -0
  10. tinygrad/engine/realize.py +191 -0
  11. tinygrad/engine/schedule.py +362 -0
  12. tinygrad/engine/search.py +196 -0
  13. tinygrad/{mlops.py → function.py} +76 -55
  14. tinygrad/helpers.py +196 -89
  15. tinygrad/lazy.py +210 -371
  16. tinygrad/multi.py +169 -0
  17. tinygrad/nn/__init__.py +202 -22
  18. tinygrad/nn/datasets.py +7 -0
  19. tinygrad/nn/optim.py +112 -32
  20. tinygrad/nn/state.py +136 -39
  21. tinygrad/ops.py +119 -202
  22. tinygrad/renderer/__init__.py +61 -0
  23. tinygrad/renderer/assembly.py +276 -0
  24. tinygrad/renderer/cstyle.py +353 -166
  25. tinygrad/renderer/llvmir.py +150 -138
  26. tinygrad/runtime/autogen/amd_gpu.py +1900 -0
  27. tinygrad/runtime/autogen/comgr.py +865 -0
  28. tinygrad/runtime/autogen/cuda.py +5923 -0
  29. tinygrad/runtime/autogen/hip.py +5909 -0
  30. tinygrad/runtime/autogen/hsa.py +5761 -0
  31. tinygrad/runtime/autogen/kfd.py +812 -0
  32. tinygrad/runtime/autogen/nv_gpu.py +33328 -0
  33. tinygrad/runtime/autogen/opencl.py +1795 -0
  34. tinygrad/runtime/driver/hip_comgr.py +47 -0
  35. tinygrad/runtime/driver/hsa.py +143 -0
  36. tinygrad/runtime/graph/clang.py +38 -0
  37. tinygrad/runtime/graph/cuda.py +81 -0
  38. tinygrad/runtime/graph/hcq.py +143 -0
  39. tinygrad/runtime/graph/hsa.py +171 -0
  40. tinygrad/runtime/graph/metal.py +75 -0
  41. tinygrad/runtime/ops_amd.py +564 -0
  42. tinygrad/runtime/ops_clang.py +24 -77
  43. tinygrad/runtime/ops_cuda.py +175 -89
  44. tinygrad/runtime/ops_disk.py +56 -33
  45. tinygrad/runtime/ops_gpu.py +92 -95
  46. tinygrad/runtime/ops_hsa.py +278 -0
  47. tinygrad/runtime/ops_llvm.py +39 -60
  48. tinygrad/runtime/ops_metal.py +92 -74
  49. tinygrad/runtime/ops_npy.py +9 -0
  50. tinygrad/runtime/ops_nv.py +630 -0
  51. tinygrad/runtime/ops_python.py +204 -0
  52. tinygrad/shape/shapetracker.py +86 -254
  53. tinygrad/shape/symbolic.py +166 -141
  54. tinygrad/shape/view.py +296 -0
  55. tinygrad/tensor.py +2619 -448
  56. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
  57. tinygrad-0.9.0.dist-info/METADATA +227 -0
  58. tinygrad-0.9.0.dist-info/RECORD +60 -0
  59. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
  60. tinygrad/codegen/assembly.py +0 -190
  61. tinygrad/codegen/optimizer.py +0 -379
  62. tinygrad/codegen/search.py +0 -72
  63. tinygrad/graph.py +0 -83
  64. tinygrad/jit.py +0 -57
  65. tinygrad/nn/image.py +0 -100
  66. tinygrad/renderer/assembly_arm64.py +0 -169
  67. tinygrad/renderer/assembly_ptx.py +0 -98
  68. tinygrad/renderer/wgsl.py +0 -53
  69. tinygrad/runtime/lib.py +0 -113
  70. tinygrad/runtime/ops_cpu.py +0 -51
  71. tinygrad/runtime/ops_hip.py +0 -82
  72. tinygrad/runtime/ops_shm.py +0 -29
  73. tinygrad/runtime/ops_torch.py +0 -30
  74. tinygrad/runtime/ops_webgpu.py +0 -45
  75. tinygrad-0.7.0.dist-info/METADATA +0 -212
  76. tinygrad-0.7.0.dist-info/RECORD +0 -40
  77. {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
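Most of the churn above is a reorganization of the package rather than new user-facing API: runtime plumbing moved from tinygrad/ops.py and tinygrad/runtime/lib.py into tinygrad/device.py, the JIT, graph, scheduler, and search moved under tinygrad/engine/, and mlops.py became function.py. A rough sketch of what that means for imports (the old and new Compiled paths are taken from the hunks below; TinyJit as the symbol carried from jit.py to engine/jit.py is an assumption based on tinygrad's public API):

```python
# 0.7.0                                   ->  0.9.0
# from tinygrad.ops import Compiled       ->  from tinygrad.device import Compiled
# from tinygrad.jit import TinyJit        ->  from tinygrad.engine.jit import TinyJit   (assumed symbol)
# tinygrad/mlops.py (autograd Functions)  ->  tinygrad/function.py
# tinygrad/runtime/lib.py (Raw*Buffer)    ->  allocators in tinygrad/device.py

# the user-facing entry point is unchanged in both versions
from tinygrad.tensor import Tensor

x = Tensor.rand(2, 2)
print((x @ x).numpy())
```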
tinygrad/runtime/ops_clang.py
@@ -1,81 +1,28 @@
-import os, time, ctypes, hashlib, subprocess, platform, tempfile, functools
-from functools import partial, reduce
-from tinygrad.ops import Compiled
-from tinygrad.helpers import fromimport, getenv, DEBUG, CI
-from tinygrad.runtime.lib import RawMallocBuffer
-from tinygrad.codegen.linearizer import LinearizerOptions
-from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
-import struct
-import numpy as np
+import ctypes, subprocess, pathlib, tempfile
+from tinygrad.device import Compiled, Compiler, MallocAllocator
+from tinygrad.helpers import cpu_time_execution, DEBUG, cpu_objdump
+from tinygrad.renderer.cstyle import ClangRenderer
 
-ARM64 = getenv('ARM64', False)
-if CI and ARM64: from unicorn import Uc, UC_ARCH_ARM64, UC_MODE_ARM, UC_HOOK_CODE, arm64_const # type: ignore
-
-args = {
-  'Windows': {'cflags':'', 'ext':'dll', 'exp':'__declspec(dllexport)'},
-  'Linux': {'cflags':'-lm -fPIC --rtlib=compiler-rt ', 'ext':'so', 'exp':''},
-  'Darwin': {'cflags':'-lm -fPIC --rtlib=compiler-rt ', 'ext':'dylib', 'exp':''}
-}[platform.system()]
-
-CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#define bool uchar\n'
-ADDRESS = 0x10000
-
-# Unicorn doesn't support external calls
-def align(addr): return (addr+4095) & ~(4095)
-mock_lm = {"sinf": np.sin, "sqrtf": np.sqrt, "exp2f": np.exp2, "log2f": np.log2}
-def emulate_ext_calls(fn, uc, address, size, user_data):
-  s_in = struct.unpack('f', struct.pack('I', uc.reg_read(getattr(arm64_const, f'UC_ARM64_REG_S{fn[2][1:]}'))))[0]
-  uc.reg_write(getattr(arm64_const, f'UC_ARM64_REG_S{fn[1][1:]}'), struct.unpack('I', struct.pack('f', mock_lm[fn[0]](s_in)))[0]) # type: ignore
+class ClangCompiler(Compiler):
+  def compile(self, src:str) -> bytes:
+    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
+    with tempfile.NamedTemporaryFile(delete=True) as output_file:
+      subprocess.check_output(['clang', '-include', 'tgmath.h', '-shared', '-march=native', '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-',
+                               '-o', str(output_file.name)], input=src.encode('utf-8'))
+      return pathlib.Path(output_file.name).read_bytes()
 
 class ClangProgram:
-  def __init__(self, name:str, prg:str, binary:bool=False):
-    # TODO: is there a way to not write this to disk?
-    # A: it seems there isn't https://stackoverflow.com/questions/28053328/ctypes-cdll-load-library-from-memory-rather-than-file
-    # because ctypes.CDLL() calls dlopen (POSIX) or LoadLibrary (Windows) which require a file
-    fn = f"{tempfile.gettempdir()}/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{args['ext']}"
-    if binary and DEBUG >= 5: print(prg)
-    if not os.path.exists(fn):
-      tmp = f"{fn}.{os.getpid()}.tmp"
-      if not binary:
-        prg = CLANG_PROGRAM_HEADER + prg
-        subprocess.check_output(args=('clang -shared -O2 -Wall -Werror -x c '+args['cflags']+' - -o '+tmp).split(), input=prg.encode('utf-8'))
-        os.rename(tmp, fn)
-      else:
-        if CI and ARM64:
-          prg = prg.split('\n') # type: ignore
-          self.varsize = align(int(prg[0].split(" ")[1]))
-          self.ext_calls = {(i*4+ADDRESS):ins.split(" ")[1:] for i, ins in enumerate(filter(lambda ins: ins[:4] != 'loop', prg[6:-3])) if ins[:2] == 'bl'}
-          prg = "\n".join(['nop' if ins[:2] == 'bl' else ins for ins in prg[6:-3]] + ['\n'])
-          subprocess.check_output(args=('aarch64-linux-gnu-as -o '+tmp).split(), input=prg.encode('utf-8'))
-          subprocess.check_output(args=('aarch64-linux-gnu-objcopy -O binary --only-section=.text '+tmp+' '+fn+'.bin').split())
-          self.prg = open(fn + '.bin', 'rb').read()
-          return
-        subprocess.check_output(args=('as -o' + tmp).split(), input=prg.encode('utf-8'))
-        subprocess.check_output(args=('clang -lm -shared '+tmp+' -o'+fn).split())
-    self.lib = ctypes.CDLL(fn)
-    self.fxn = self.lib[name]
-  def __call__(self, global_size, local_size, *args, wait=False):
-    if wait: st = time.monotonic()
-    if CI and ARM64:
-      mu = Uc(UC_ARCH_ARM64, UC_MODE_ARM)
-      total_mem = align(reduce(lambda total, arg: total + arg.size * arg.dtype.itemsize, args, len(self.prg)+self.varsize))
-      mu.mem_map(ADDRESS, total_mem)
-      for k, fn in self.ext_calls.items(): mu.hook_add(UC_HOOK_CODE, partial(emulate_ext_calls, fn), begin=k, end=k)
-      mu.mem_write(ADDRESS, self.prg + b''.join(bytes(arg._buf) for arg in args))
-      addr = ADDRESS + len(self.prg)
-      for i, arg in enumerate(args):
-        if i<=7:
-          mu.reg_write(getattr(arm64_const, f'UC_ARM64_REG_X{i}'), addr)
-        else:
-          # NOTE: In ARM, args beyond the first 8 are placed on the stack it also account for the stack red zone.
-          mu.mem_write(ADDRESS + total_mem - (len(args[8:])+2)*8 + 8*(i-8), addr.to_bytes(8, 'little'))
-        addr += arg.size * arg.dtype.itemsize
-      mu.reg_write(arm64_const.UC_ARM64_REG_SP, ADDRESS + total_mem - (len(args[8:])+2)*8)
-      mu.emu_start(ADDRESS, ADDRESS + len(self.prg))
-      args[0]._buf = mu.mem_read(mu.reg_read(arm64_const.UC_ARM64_REG_X0), args[0].size * args[0].dtype.itemsize)
-    else:
-      self.fxn(*[x._buf if isinstance(x, RawMallocBuffer) else x for x in args])
-    if wait: return time.monotonic()-st
+  def __init__(self, name:str, lib:bytes):
+    if DEBUG >= 6: cpu_objdump(lib)
+    self.name, self.lib = name, lib
+    # write to disk so we can load it
+    with tempfile.NamedTemporaryFile(delete=True) as cached_file_path:
+      pathlib.Path(cached_file_path.name).write_bytes(lib)
+      self.fxn = ctypes.CDLL(str(cached_file_path.name))[name]
+
+  def __call__(self, *bufs, vals=(), wait=False): return cpu_time_execution(lambda: self.fxn(*bufs, *vals), enable=wait)
 
-renderer = fromimport("tinygrad.renderer.assembly_arm64", "uops_to_arm64_asm") if ARM64 else functools.partial(uops_to_cstyle, CStyleLanguage(kernel_prefix=args['exp'], buffer_suffix=" restrict", arg_int_prefix="const int"))
-ClangBuffer = Compiled(RawMallocBuffer, LinearizerOptions(supports_float4=False, has_local=False), renderer, ClangProgram)
+class ClangDevice(Compiled):
+  def __init__(self, device:str):
+    from tinygrad.runtime.graph.clang import ClangGraph
+    super().__init__(device, MallocAllocator, ClangRenderer(), ClangCompiler("compile_clang"), ClangProgram, ClangGraph)
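The hunk above shows the 0.9.0 backend split: a Compiler turns rendered source into raw shared-object bytes, a Program loads those bytes and makes them callable, and the Device wires both to an allocator, renderer, and graph runner. A minimal sketch of driving the two Clang pieces by hand, assuming a working local clang toolchain; the C kernel is hand-written purely for illustration (ClangRenderer normally emits it):

```python
import ctypes
from tinygrad.runtime.ops_clang import ClangCompiler, ClangProgram

# hand-written stand-in for what ClangRenderer would emit (illustrative only)
src = "void add1(float* out, const float* a) { for (int i = 0; i < 4; i++) out[i] = a[i] + 1.0f; }"

lib = ClangCompiler("compile_clang").compile(src)  # clang -shared ... -> raw shared-object bytes
prg = ClangProgram("add1", lib)                    # writes the bytes to a temp file and dlopens them

out, a = (ctypes.c_float * 4)(), (ctypes.c_float * 4)(1, 2, 3, 4)
et = prg(out, a, wait=True)                        # cpu_time_execution returns seconds when wait=True
print([v for v in out], et)                        # [2.0, 3.0, 4.0, 5.0] <elapsed seconds>
```

ClangDevice performs exactly this wiring in its constructor, so in normal use none of these objects are touched directly.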
tinygrad/runtime/ops_cuda.py
@@ -1,99 +1,185 @@
-import subprocess, time, re, hashlib, tempfile, os, functools
-from typing import Optional
-import numpy as np
-from pycuda.compiler import compile as cuda_compile # type: ignore
-from tinygrad.helpers import DEBUG, getenv, colored, fromimport
-from tinygrad.ops import Compiled
-from tinygrad.runtime.lib import RawBufferCopyInOut, RawMallocBuffer, LRUAllocator
-from tinygrad.codegen.linearizer import LinearizerOptions
-from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
+from __future__ import annotations
+import subprocess, hashlib, tempfile, ctypes, ctypes.util, functools, re
+from pathlib import Path
+from typing import Tuple, Optional, List
+import tinygrad.runtime.autogen.cuda as cuda
+from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
+from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
+from tinygrad.renderer.cstyle import CUDARenderer
+from tinygrad.renderer.assembly import PTXRenderer
+if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
 
 def pretty_ptx(s):
   # all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
-  s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers
+  s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers # noqa: E501
   s = re.sub(r'(.)((?:b|s|u|f)(?:8|16|32|64)|pred)([\.\s])', lambda m:m[1]+colored(m[2], "green")+m[3], s, flags=re.M) # types
   s = re.sub(r'^(\s*)([\w]+)(.*?;$)', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # instructions
-  s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers
+  s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers # noqa: E501
   s = re.sub(r'(\.)(param|reg|global)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # space
   s = re.sub(r'(\.)(version|target|address_size|visible|entry)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # derivatives
   return s
-def arch(): return "sm_" + "".join([str(x) for x in pycuda.driver.Context.get_device().compute_capability()])
-
-if getenv("CUDACPU", 0) == 1:
-  import ctypes, ctypes.util
-  lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
-  lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int]
-  class cuda:
-    class module:
-      def __init__(self, src): self.src = src
-      def get_function(self, _): return self
-      def __call__(self, *args, block, grid): lib.ptx_run(self.src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), *block, *grid)
-    module_from_buffer = lambda src: cuda.module(src) # pylint: disable=unnecessary-lambda # noqa: E731
-    class Event:
-      def __init__(self): pass
-      def record(self): self.start = time.perf_counter()
-      def time_till(self, other): return self.start - other.start
-      def synchronize(self): pass
-    class Context:
-      synchronize = lambda:0 # noqa: E731
-    CompileError = Exception
-  class context:
-    class device:
-      compute_capability = lambda: (3,5) # pylint: disable=unnecessary-lambda # noqa: E731
-    get_device = lambda: context.device # pylint: disable=unnecessary-lambda # noqa: E731
-  import pycuda.driver # type: ignore
-  pycuda.driver.Context = context
-  RawCUDABuffer = RawMallocBuffer
-else:
-  import pycuda.autoprimaryctx # type: ignore # pylint: disable=unused-import # noqa: F401
-  import pycuda.driver as cuda # type: ignore
-  class CUDAAllocator(LRUAllocator):
-    def _do_alloc(self, size, dtype, device, **kwargs): return cuda.mem_alloc(size * dtype.itemsize) # type: ignore
-    def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
-  CUDAAlloc = CUDAAllocator(pycuda.driver.Context.get_device().total_memory())
-  class RawCUDABuffer(RawBufferCopyInOut): # type: ignore
-    def __init__(self, size, dtype): super().__init__(size, dtype, allocator=CUDAAlloc)
-    def _copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._buf, x.ravel(), stream) # type: ignore
-    def _copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._buf) # type: ignore
+
+CUDACPU = getenv("CUDACPU") == 1
+if CUDACPU:
+  gpuocelot_lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
+  gpuocelot_lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] # noqa: E501
+  cuda.cuLaunchKernel = lambda src, gx, gy, gz, lx, ly, lz, shared, stream, unused_extra, args: gpuocelot_lib.ptx_run(src, len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]), lx, ly, lz, gx, gy, gz, shared) # type: ignore # noqa: E501
+
+def check(status):
+  if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501
+
+def encode_args(args, vals) -> Tuple[ctypes.Structure, ctypes.Array]:
+  c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] +
+                                 [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
+  vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2),
+                                ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p), ctypes.c_void_p(0))
+  return c_args, vargs
+
+def cu_time_execution(cb, enable=False) -> Optional[float]:
+  if CUDACPU: return cpu_time_execution(cb, enable=enable)
+  if not enable: return cb()
+  evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
+  cuda.cuEventRecord(evs[0], None)
+  cb()
+  cuda.cuEventRecord(evs[1], None)
+  check(cuda.cuEventSynchronize(evs[1]))
+  cuda.cuEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), evs[0], evs[1])
+  for ev in evs: cuda.cuEventDestroy_v2(ev)
+  return ret.value * 1e-3
+
+def _get_bytes(arg, get_str, get_sz, check) -> bytes:
+  sz = init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x))))
+  return ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value)
+
+class PTXCompiler(Compiler):
+  def __init__(self, arch:str):
+    self.arch = arch
+    self.version = "7.8" if arch >= "sm_89" else "7.5"
+    super().__init__(f"compile_ptx_{self.arch}")
+  def compile(self, src:str) -> bytes: return src.replace("TARGET", self.arch).replace("VERSION", self.version).encode()
+
+class CUDACompiler(Compiler):
+  def __init__(self, arch:str):
+    self.arch = arch
+    check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
+    self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
+    if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
+    super().__init__(f"compile_cuda_{self.arch}")
+  def compile(self, src:str) -> bytes:
+    check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
+    status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
+
+    if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
+    return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
+
+def cuda_disassemble(lib, arch):
+  try:
+    fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
+    with open(fn + ".ptx", "wb") as f: f.write(lib)
+    subprocess.run(["ptxas", f"-arch={arch}", "-o", fn, fn+".ptx"], check=True)
+    print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
+  except Exception as e: print("failed to generate SASS", str(e))
 
 class CUDAProgram:
-  def __init__(self, name:str, prg:str, binary=False):
-    if not binary:
-      try: prg = cuda_compile(prg, target="ptx", no_extern_c=True, options=['-Wno-deprecated-gpu-targets']).decode('utf-8')
-      except cuda.CompileError as e:
-        if DEBUG >= 3: print("FAILED TO BUILD", prg)
-        raise e
-    if DEBUG >= 5: print(pretty_ptx(prg))
-    if DEBUG >= 6:
-      try:
-        fn = os.path.join(tempfile.gettempdir(), f"tinycuda_{hashlib.md5(prg.encode('utf-8')).hexdigest()}")
-        with open(fn + ".ptx", "wb") as f: f.write(prg.encode('utf-8'))
-        subprocess.run(["ptxas", f"-arch={arch()}", "-o", fn, fn+".ptx"], check=True)
-        print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
-      except Exception as e: print("failed to generate SASS", str(e))
-    # TODO: name is wrong, so we get it from the ptx using hacks
-    self.prg = cuda.module_from_buffer(prg.encode('utf-8')).get_function(prg.split(".visible .entry ")[1].split("(")[0])
-
-  def __call__(self, global_size, local_size, *args, wait=False):
-    if wait:
-      start, end = cuda.Event(), cuda.Event()
-      start.record()
-    self.prg(*[x._buf if isinstance(x, RawCUDABuffer) else np.int32(x) if (isinstance(x, int) and not getenv("CUDACPU")) else x for x in args], block=tuple(local_size), grid=tuple(global_size))
-    if wait:
-      end.record()
-      end.synchronize()
-      return start.time_till(end)*1e-3
-
-renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
-  kernel_prefix = "__global__", smem_prefix = "__shared__ ", arg_int_prefix = "const int", barrier = "__syncthreads();", float4 = "make_float4",
-  gid = [f'blockIdx.{chr(120+i)}' for i in range(3)],
-  lid = [f'threadIdx.{chr(120+i)}' for i in range(3)],
-  half_prekernel = """
-    #include <cuda_fp16.h>
-    struct __align__(8) half4 {
-      half2 x, y;
-      __device__ __forceinline__ explicit half4(const float4& a): x(make_half2(__float2half(a.x), __float2half(a.y))), y(make_half2(__float2half(a.z),__float2half(a.w))) {}
-      __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); }
-    };
-  """)) if not getenv("PTX") else fromimport("tinygrad.renderer.assembly_ptx", "uops_to_ptx_asm")
-CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False if getenv("PTX") else True, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), renderer, CUDAProgram, cuda.Context.synchronize)
+  def __init__(self, device:CUDADevice, name:str, lib:bytes):
+    self.device, self.name, self.lib = device, name, lib
+    if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))]))
+    if DEBUG >= 6: cuda_disassemble(lib, device.arch)
+
+    if CUDACPU: self.prg = lib
+    else:
+      check(cuda.cuCtxSetCurrent(self.device.context))
+      self.module = cuda.CUmodule()
+      status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
+      if status != 0:
+        del self.module
+        cuda_disassemble(lib, device.arch)
+        raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
+      check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
+      self.prg = prg #type: ignore
+
+  def __del__(self):
+    if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module))
+
+  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    if CUDACPU: self.vargs = args+tuple(vals)
+    else:
+      check(cuda.cuCtxSetCurrent(self.device.context))
+      if not hasattr(self, "vargs"):
+        self.c_args, self.vargs = encode_args(args, vals) #type: ignore
+      else:
+        for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
+        for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
+    return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs)), enable=wait)

+class CUDAAllocator(LRUAllocator):
+  def __init__(self, device:CUDADevice):
+    self.device = device
+    super().__init__()
+  def _alloc(self, size, options:BufferOptions):
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01)))
+    else: return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size)))
+  def _free(self, opaque, options:BufferOptions):
+    if options.host: return check(cuda.cuMemFreeHost(opaque))
+    else: check(cuda.cuMemFree_v2(opaque))
+  def copyin(self, dest, src:memoryview):
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    host_mem = self.alloc(len(src), BufferOptions(host=True))
+    self.device.pending_copyin.append((host_mem, len(src), BufferOptions(host=True)))
+    ctypes.memmove(host_mem, from_mv(src), len(src))
+    check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None))
+  def copyout(self, dest:memoryview, src):
+    CUDADevice.synchronize_system()
+    check(cuda.cuCtxSetCurrent(self.device.context))
+    check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest)))
+  def transfer(self, dest, src, sz:int, src_dev, dest_dev):
+    check(cuda.cuCtxSetCurrent(src_dev.context))
+    check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0))
+    check(cuda.cuMemcpyDtoDAsync_v2(dest, src, sz, None))
+    check(cuda.cuEventRecord(sync_event, None))
+    check(cuda.cuCtxSetCurrent(dest_dev.context))
+    check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev
+  def offset(self, buf, size:int, offset:int): return ctypes.c_ulong(buf.value + offset)
+
+class CUDADevice(Compiled):
+  devices: List[CUDADevice] = []
+  peer_access = False
+
+  def __init__(self, device:str):
+    device_id = int(device.split(":")[1]) if ":" in device else 0
+    if not CUDACPU:
+      check(cuda.cuInit(0))
+      self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id)))
+      self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device)))
+      check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id))
+
+      for dev in CUDADevice.devices:
+        check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device))
+        if val.value != 1: continue
+        check(cuda.cuCtxSetCurrent(dev.context))
+        check(cuda.cuCtxEnablePeerAccess(self.context, 0))
+        check(cuda.cuCtxSetCurrent(self.context))
+        check(cuda.cuCtxEnablePeerAccess(dev.context, 0))
+        CUDADevice.peer_access = True
+
+    self.arch = f"sm_{major.value}{minor.value}" if not CUDACPU else "sm_35"
+    self.pending_copyin: List[Tuple[int, int, Optional[BufferOptions]]] = []
+    CUDADevice.devices.append(self)
+
+    from tinygrad.runtime.graph.cuda import CUDAGraph
+    super().__init__(device, CUDAAllocator(self) if not CUDACPU else MallocAllocator,
+                     PTXRenderer(self.arch) if getenv("PTX") else CUDARenderer(self.arch),
+                     PTXCompiler(self.arch) if getenv("PTX") else CUDACompiler(self.arch),
+                     functools.partial(CUDAProgram, self), graph=CUDAGraph if not CUDACPU else None)
+
+  def synchronize(self):
+    if CUDACPU: return
+    check(cuda.cuCtxSetCurrent(self.context))
+    check(cuda.cuCtxSynchronize())
+    for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
+    self.pending_copyin.clear()
+
+  @staticmethod
+  def synchronize_system():
+    for d in CUDADevice.devices: d.synchronize()
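The same Compiler/Program/Allocator split appears here, but built on the raw ctypes bindings in tinygrad.runtime.autogen.cuda rather than PyCUDA. A hedged round-trip sketch, assuming an NVIDIA GPU with the CUDA driver and NVRTC installed; the kernel source, its name, and the launch shape are illustrative, not taken from the diff:

```python
import struct
from tinygrad.device import BufferOptions
from tinygrad.runtime.ops_cuda import CUDADevice, CUDACompiler, CUDAProgram

# hand-written stand-in for CUDARenderer output (illustrative only)
src = """extern "C" __global__ void add1(float* out, const float* a) {
  int i = threadIdx.x;
  out[i] = a[i] + 1.0f;
}"""

dev = CUDADevice("CUDA")                   # device 0: cuInit, cuCtxCreate, sets dev.arch
ptx = CUDACompiler(dev.arch).compile(src)  # NVRTC -> PTX bytes (raises CompileError on failure)
prg = CUDAProgram(dev, "add1", ptx)        # cuModuleLoadData + cuModuleGetFunction

a = memoryview(bytearray(struct.pack("4f", 1, 2, 3, 4)))
out = memoryview(bytearray(4 * 4))
a_buf = dev.allocator.alloc(len(a), BufferOptions())
out_buf = dev.allocator.alloc(len(out), BufferOptions())
dev.allocator.copyin(a_buf, a)             # staged through pinned host memory, async HtoD
prg(out_buf, a_buf, global_size=(1, 1, 1), local_size=(4, 1, 1), wait=True)
dev.allocator.copyout(out, out_buf)        # synchronizes, then DtoH
print(struct.unpack("4f", out))            # (2.0, 3.0, 4.0, 5.0)
```

With PTX=1 the device swaps in PTXRenderer/PTXCompiler, and with CUDACPU=1 the launch is routed through the gpuocelot shim shown at the top of the hunk.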
tinygrad/runtime/ops_disk.py
@@ -1,37 +1,60 @@
-import os, mmap
+from __future__ import annotations
+import os, mmap, _posixshmem, io
 from typing import Optional
-from typing import Callable, Dict, Tuple
-from tinygrad.helpers import prod, DType
-from tinygrad.runtime.lib import RawBufferMapped
-from tinygrad.ops import Interpreted, Op, MovementOps, UnaryOps
+from tinygrad.helpers import OSX
+from tinygrad.device import Compiled, Allocator
 
-class RawDiskBuffer(RawBufferMapped):
-  def __init__(self, size, dtype:DType, device:Optional[str]=None, buf=None, shape=None, offset=0): # pylint: disable=super-init-not-called
-    self.shape = (size, ) if shape is None else shape
-    self.offset = offset # this is an offset in bytes
-    assert device is not None or buf is not None, "disk tensor needs a path or a buf"
-    if device is not None:
-      f = open(device, "a+b")
-      if os.path.getsize(device) < size * dtype.itemsize: os.ftruncate(f.fileno(), size * dtype.itemsize)
-      buf = [f, mmap.mmap(f.fileno(), size * dtype.itemsize), 1]
+class DiskBuffer:
+  def __init__(self, device:DiskDevice, size:int, offset=0):
+    self.device, self.size, self.offset = device, size, offset
+  def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
+  def _buf(self) -> memoryview:
+    assert self.device.mem is not None, "DiskBuffer wasn't opened"
+    return memoryview(self.device.mem)[self.offset:self.offset+self.size]
+
+MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
+class DiskAllocator(Allocator):
+  def __init__(self, device:DiskDevice): self.device = device
+  def _alloc(self, size:int, options):
+    self.device._might_open(size)
+    return DiskBuffer(self.device, size)
+  def _free(self, buf, options): self.device._might_close()
+  def as_buffer(self, src:DiskBuffer): return src._buf()
+  def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
+  def copyout(self, dest:memoryview, src:DiskBuffer):
+    if OSX and hasattr(self.device, 'fd'):
+      # OSX doesn't seem great at mmap, this is faster
+      with io.FileIO(self.device.fd, "a+b", closefd=False) as fo:
+        fo.seek(src.offset)
+        fo.readinto(dest)
     else:
-      buf[2] += 1
-    # NOTE: we don't call super since disk tensors don't use RAM
-    self.size, self.dtype, self._buf = size, dtype, buf
-  def __del__(self):
-    self._buf[2] -= 1
-    if self._buf[2] == 0: self._buf[0].close()
-  def cast(self, arg:Tuple[DType, bool]): return RawDiskBuffer(self.size, arg[0], buf=self._buf, shape=self.shape, offset=self.offset)
-  def reshape(self, arg): return RawDiskBuffer(self.size, self.dtype, buf=self._buf, shape=arg, offset=self.offset)
-  def shrink(self, arg):
-    assert arg[1:] == tuple([(0,x) for x in self.shape[1:]]), f"can only slice the first dim of disk tensor {arg}"
-    offset = arg[0][0]*prod(self.shape[1:])*self.dtype.itemsize
-    size = (arg[0][1]-arg[0][0]) * prod(self.shape[1:])
-    return RawDiskBuffer(size, self.dtype, buf=self._buf, offset=self.offset+offset, shape=(arg[0][1]-arg[0][0],)+self.shape[1:])
-  def _buffer(self): return memoryview(self._buf[1])[self.offset:self.offset+self.size*self.dtype.itemsize]
-  def readinto(self, buf):
-    self._buf[0].seek(self.offset)
-    self._buf[0].readinto(buf)
+      dest[:] = src._buf()
+  def offset(self, buf:DiskBuffer, size:int, offset:int): return DiskBuffer(buf.device, size, offset)
+
+class DiskDevice(Compiled):
+  def __init__(self, device:str):
+    self.size: Optional[int] = None
+    self.count = 0
+    super().__init__(device, DiskAllocator(self), None, None, None)
+  def _might_open(self, size):
+    self.count += 1
+    assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
+    if self.size is not None: return
+    filename = self.dname[len("disk:"):]
+    self.size = size
 
-disk_fxn_for_op: Dict[Op, Callable] = { UnaryOps.NOOP: lambda x: x, UnaryOps.CAST: RawDiskBuffer.cast, MovementOps.RESHAPE: RawDiskBuffer.reshape, MovementOps.SHRINK: RawDiskBuffer.shrink }
-DiskBuffer = Interpreted(RawDiskBuffer, disk_fxn_for_op, to_underlying=lambda x:x, from_underlying=lambda x:x)
+    if filename.startswith("shm:"):
+      fd = _posixshmem.shm_open("/"+filename[4:].lstrip("/"), os.O_RDWR, 0o600)
+      self.mem = mmap.mmap(fd, self.size, mmap.MAP_SHARED | MAP_POPULATE | MAP_LOCKED)
+      os.close(fd)
+    else:
+      try: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT|(0 if OSX else os.O_DIRECT))
+      except OSError: self.fd = os.open(filename, os.O_RDWR|os.O_CREAT)
+      if os.fstat(self.fd).st_size < self.size: os.ftruncate(self.fd, self.size)
+      self.mem = mmap.mmap(self.fd, self.size)
+    if (hp := getattr(mmap, "MADV_HUGEPAGE", None)) is not None: self.mem.madvise(hp) # type: ignore
+  def _might_close(self):
+    self.count -= 1
+    if self.count == 0:
+      if hasattr(self, 'fd'): os.close(self.fd)
+      self.size = None
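DiskDevice replaces the old Interpreted RawDiskBuffer with an ordinary, compiler-less Compiled device whose allocator hands out offsets into a single mmap'd file. A small sketch of using it directly, assuming a writable scratch path; in normal use this layer is reached through Tensor's "disk:..." device strings rather than called by hand:

```python
import os, tempfile
from tinygrad.device import BufferOptions
from tinygrad.runtime.ops_disk import DiskDevice

path = os.path.join(tempfile.gettempdir(), "tinygrad_disk_demo.bin")  # illustrative scratch file
dev = DiskDevice(f"disk:{path}")

buf = dev.allocator.alloc(16, BufferOptions())  # first alloc opens and mmaps the file
dev.allocator.copyin(buf, memoryview(bytearray(b"0123456789abcdef")))
out = memoryview(bytearray(16))
dev.allocator.copyout(out, buf)                 # mmap read (or FileIO on OSX)
print(bytes(out))                               # b'0123456789abcdef'
```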