tinygrad 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +6 -0
- tinygrad/codegen/kernel.py +572 -83
- tinygrad/codegen/linearizer.py +415 -395
- tinygrad/codegen/uops.py +415 -0
- tinygrad/device.py +183 -0
- tinygrad/dtype.py +113 -0
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +195 -0
- tinygrad/engine/realize.py +191 -0
- tinygrad/engine/schedule.py +362 -0
- tinygrad/engine/search.py +196 -0
- tinygrad/{mlops.py → function.py} +76 -55
- tinygrad/helpers.py +196 -89
- tinygrad/lazy.py +210 -371
- tinygrad/multi.py +169 -0
- tinygrad/nn/__init__.py +202 -22
- tinygrad/nn/datasets.py +7 -0
- tinygrad/nn/optim.py +112 -32
- tinygrad/nn/state.py +136 -39
- tinygrad/ops.py +119 -202
- tinygrad/renderer/__init__.py +61 -0
- tinygrad/renderer/assembly.py +276 -0
- tinygrad/renderer/cstyle.py +353 -166
- tinygrad/renderer/llvmir.py +150 -138
- tinygrad/runtime/autogen/amd_gpu.py +1900 -0
- tinygrad/runtime/autogen/comgr.py +865 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5761 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33328 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/hip_comgr.py +47 -0
- tinygrad/runtime/driver/hsa.py +143 -0
- tinygrad/runtime/graph/clang.py +38 -0
- tinygrad/runtime/graph/cuda.py +81 -0
- tinygrad/runtime/graph/hcq.py +143 -0
- tinygrad/runtime/graph/hsa.py +171 -0
- tinygrad/runtime/graph/metal.py +75 -0
- tinygrad/runtime/ops_amd.py +564 -0
- tinygrad/runtime/ops_clang.py +24 -77
- tinygrad/runtime/ops_cuda.py +175 -89
- tinygrad/runtime/ops_disk.py +56 -33
- tinygrad/runtime/ops_gpu.py +92 -95
- tinygrad/runtime/ops_hsa.py +278 -0
- tinygrad/runtime/ops_llvm.py +39 -60
- tinygrad/runtime/ops_metal.py +92 -74
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +630 -0
- tinygrad/runtime/ops_python.py +204 -0
- tinygrad/shape/shapetracker.py +86 -254
- tinygrad/shape/symbolic.py +166 -141
- tinygrad/shape/view.py +296 -0
- tinygrad/tensor.py +2619 -448
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
- tinygrad-0.9.0.dist-info/METADATA +227 -0
- tinygrad-0.9.0.dist-info/RECORD +60 -0
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/assembly.py +0 -190
- tinygrad/codegen/optimizer.py +0 -379
- tinygrad/codegen/search.py +0 -72
- tinygrad/graph.py +0 -83
- tinygrad/jit.py +0 -57
- tinygrad/nn/image.py +0 -100
- tinygrad/renderer/assembly_arm64.py +0 -169
- tinygrad/renderer/assembly_ptx.py +0 -98
- tinygrad/renderer/wgsl.py +0 -53
- tinygrad/runtime/lib.py +0 -113
- tinygrad/runtime/ops_cpu.py +0 -51
- tinygrad/runtime/ops_hip.py +0 -82
- tinygrad/runtime/ops_shm.py +0 -29
- tinygrad/runtime/ops_torch.py +0 -30
- tinygrad/runtime/ops_webgpu.py +0 -45
- tinygrad-0.7.0.dist-info/METADATA +0 -212
- tinygrad-0.7.0.dist-info/RECORD +0 -40
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
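For orientation, the listing above reflects a reorganized public API: scheduling, JIT, and realization code move into a new tinygrad/engine/ package, mlops.py becomes function.py, and the previously empty tinygrad/__init__.py gains six lines of re-exports. A minimal usage sketch against the 0.9.0 layout follows; the exact top-level export list (Tensor, TinyJit, Device) is an assumption inferred from the new __init__.py and engine/jit.py entries, not something spelled out in this diff.

```py
# Hypothetical sketch of user code on the 0.9.0 layout; the top-level re-exports
# are assumed from the +6-line tinygrad/__init__.py shown in the listing above.
from tinygrad import Tensor, TinyJit, Device

@TinyJit
def step(x: Tensor) -> Tensor:
  # matmul + relu + reduce; .realize() lets the JIT capture the kernels it replays
  return (x @ x.T).relu().sum().realize()

if __name__ == "__main__":
  print(Device.DEFAULT)                 # backend picked at import time (METAL, CUDA, CLANG, ...)
  for _ in range(3):                    # the kernel graph is captured after the warm-up calls
    out = step(Tensor.rand(64, 64).realize())
  print(out.item())
```

In 0.7.0 the equivalent imports were spread across tinygrad.tensor, tinygrad.jit, and tinygrad.ops, which is why the per-file diffs below show those modules being removed or rewritten.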
tinygrad/runtime/ops_hip.py
DELETED
@@ -1,82 +0,0 @@
-import numpy as np
-import ctypes, functools
-import extra.hip_wrapper as hip
-from tinygrad.helpers import DEBUG
-from tinygrad.ops import Compiled
-from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator
-from tinygrad.codegen.linearizer import LinearizerOptions
-from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
-
-# TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait()
-if DEBUG >= 5:
-  from extra.helpers import enable_early_exec
-  early_exec = enable_early_exec()
-
-# The default HIP stream is used for everything.
-
-class HIPAllocator(LRUAllocator):
-  def _do_alloc(self, size, dtype, device, **kwargs): return hip.hipMalloc(size * dtype.itemsize)
-  def _do_free(self, buf): hip.hipFree(buf)
-  def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
-HIPAlloc = HIPAllocator(hip.hipGetDeviceProperties(hip.hipGetDevice()).totalGlobalMem)
-
-class RawHIPBuffer(RawBufferCopyInOut):
-  def __init__(self, size, dtype): super().__init__(size, dtype, allocator=HIPAlloc)
-  def _copyin(self, x:np.ndarray): hip.hipMemcpyAsync_htod(self._buf, x.ctypes.data, self.size * self.dtype.itemsize, 0)
-  def _copyout(self, x:np.ndarray): hip.hipMemcpy_dtoh(x.ctypes.data, self._buf, self.size * self.dtype.itemsize)
-
-class HIPProgram:
-  def __init__(self, name:str, prg:str, binary=False):
-    try:
-      if not binary:
-        prog = hip.hiprtcCreateProgram(prg, name, [], [])
-        device_properties = hip.hipGetDeviceProperties(hip.hipGetDevice())
-        hip.hiprtcCompileProgram(prog, [f'--offload-arch={device_properties.gcnArchName}'])
-        prg = hip.hiprtcGetCode(prog)
-    except Exception as e:
-      if DEBUG >= 3: print("FAILED TO BUILD", prg)
-      raise e
-    if DEBUG >= 5:
-      asm = early_exec((["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], prg))
-      print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
-
-    module = hip.hipModuleLoadData(prg)
-    self.prg = hip.hipModuleGetFunction(module, name)
-
-  def __call__(self, global_size, local_size, *args, wait=False):
-    if wait:
-      start, end = hip.hipEventCreate(), hip.hipEventCreate()
-      hip.hipEventRecord(start)
-    class PackageStruct(ctypes.Structure):
-      _fields_ = [(f'field{idx}', ctypes.c_void_p) for idx in range(len(args))]
-    struct = PackageStruct(*[data._buf for data in args])
-    hip.hipModuleLaunchKernel(self.prg, global_size[0], global_size[1], global_size[2], local_size[0], local_size[1], local_size[2], 0, 0, struct)
-    if wait:
-      hip.hipEventRecord(end)
-      hip.hipEventSynchronize(end)
-      return hip.hipEventElapsedTime(start, end)*1e-3
-
-renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
-  kernel_prefix = "#include <hip/hip_common.h>\n#define INFINITY (__builtin_inff())\n#define NAN (__builtin_nanf(\"\"))" + """
-__device__ float4 max(float4 x, float4 y) { return float4(max(x.x, y.x), max(x.y, y.y), max(x.z, y.z), max(x.w, y.w)); }
-__device__ float4 pow(float x, float4 y) { return float4(pow(x, y.x), pow(x, y.y), pow(x, y.z), pow(x, y.w)); }
-__device__ float4 pow(float4 x, float4 y) { return float4(pow(x.x, y.x), pow(x.y, y.y), pow(x.z, y.z), pow(x.w, y.w)); }
-__device__ float4 log2(float4 x) { return float4(log2(x.x), log2(x.y), log2(x.z), log2(x.w)); }
-__device__ float4 exp2(float4 x) { return float4(exp2(x.x), exp2(x.y), exp2(x.z), exp2(x.w)); }
-__device__ float4 sin(float4 x) { return float4(sin(x.x), sin(x.y), sin(x.z), sin(x.w)); }
-typedef float float8 __attribute__((ext_vector_type(8)));
-typedef _Float16 half16 __attribute__((ext_vector_type(16)));
-extern "C" __global__
-""", launch_bounds=True,
-  smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4", uses_vload=True, uses_ptr_arithmetic=True,
-  half_prekernel = "#include <hip/hip_fp16.h>\nusing half4 = HIP_vector_type<half, 4>;" + """
-__device__ float vload_half(size_t offset, const half *p) { return (float)*(p + offset); }
-__device__ float2 vload_half2(size_t offset, const half *p) { return make_float2((float)*(p + offset*2), (float)*(p + offset*2 + 1)); }
-__device__ float4 vload_half4(size_t offset, const half *p) { return make_float4((float)*(p + offset*4), (float)*(p + offset*4 + 1), (float)*(p + offset*4 + 2), (float)*(p + offset*4 + 3)); }
-__device__ void vstore_half(float data, size_t offset, half *p) { *(p + offset) = (half)data; }
-__device__ void vstore_half2(float2 data, size_t offset, half *p) { *(p + offset*2) = (half)data.x; *(p + offset*2 + 1) = (half)data.y; }
-__device__ void vstore_half4(float4 data, size_t offset, half *p) { *(p + offset*4) = (half)data.x; *(p + offset*4 + 1) = (half)data.y; *(p + offset*4 + 2) = (half)data.z; *(p + offset*4 + 3) = (half)data.w; }
-""",
-  gid = [f'blockIdx.{chr(120+i)}' for i in range(3)],
-  lid = [f'threadIdx.{chr(120+i)}' for i in range(3)]))
-HIPBuffer = Compiled(RawHIPBuffer, LinearizerOptions(), renderer, HIPProgram, hip.hipDeviceSynchronize)
tinygrad/runtime/ops_shm.py
DELETED
@@ -1,29 +0,0 @@
-import os, mmap
-try: import _posixshmem # type: ignore
-except Exception: pass
-from typing import Callable, Dict
-from tinygrad.helpers import DType
-from tinygrad.runtime.lib import RawBufferMapped
-from tinygrad.ops import Interpreted, Op, UnaryOps, MovementOps
-
-SHM_CACHE: Dict[str, mmap.mmap] = {}
-class RawShmBuffer(RawBufferMapped):
-  def __init__(self, size, dtype:DType, device:str):
-    device, self.cache_id = device.split(",")[0], None if "," not in device else device.split(",")[1]
-
-    if self.cache_id is not None and self.cache_id in SHM_CACHE: shm = SHM_CACHE[self.cache_id]
-    else:
-      fd = _posixshmem.shm_open(device, os.O_RDWR, 0o600)
-      # TODO: these flags are somewhat platform specific, but python doesn't expose the ones we need
-      shm = mmap.mmap(fd, size * dtype.itemsize, flags=mmap.MAP_SHARED | 0x2000 | 0x008000)
-      shm.madvise(mmap.MADV_HUGEPAGE) # type: ignore
-      os.close(fd)
-      if self.cache_id is not None: SHM_CACHE[self.cache_id] = shm
-
-    super().__init__(size, dtype, shm)
-  def __del__(self):
-    if self.cache_id is None: self._buf.close()
-  def _buffer(self): return memoryview(self._buf)
-
-shm_fxn_for_op: Dict[Op, Callable] = { UnaryOps.NOOP: lambda x:x, MovementOps.RESHAPE: lambda x,_:x }
-ShmBuffer = Interpreted(RawShmBuffer, shm_fxn_for_op, to_underlying=lambda x:x, from_underlying=lambda x:x)
tinygrad/runtime/ops_torch.py
DELETED
@@ -1,30 +0,0 @@
-import torch
-from typing import Dict, Callable, Optional
-from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, TernaryOps, Op, Interpreted
-from tinygrad.helpers import getenv, dtypes, prod, DType
-from tinygrad.runtime.ops_cpu import base_fxn_for_op, einsum_mulacc
-from tinygrad.runtime.lib import RawBuffer
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
-type_map = {torch.float64: dtypes.float64, torch.float16: dtypes.float16, torch.float32: dtypes.float32, torch.int8: dtypes.int8, torch.int32: dtypes.int32, torch.int64: dtypes.int64, torch.uint8: dtypes.uint8, torch.bool: dtypes.bool}
-inverse_type_map = {v:k for k,v in type_map.items()}
-
-torch_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{
-  UnaryOps.NOOP: lambda x: x.contiguous(), UnaryOps.SQRT: lambda x: x.sqrt(), UnaryOps.EXP2: lambda x: x.exp2(), UnaryOps.LOG2: lambda x: x.log2(), UnaryOps.SIN: torch.sin,
-  UnaryOps.CAST: lambda x,y: (x.view if y[1] else x.type)(next(k for k,v in type_map.items() if v==y[0])),
-  BinaryOps.MAX: torch.maximum, BinaryOps.CMPLT: lambda x,y: (x<y).type(torch.promote_types(x.dtype, y.dtype)),
-  MovementOps.PAD: lambda x, padding: torch.nn.functional.pad(x, [item for sublist in padding[::-1] for item in sublist]),
-  TernaryOps.MULACC: einsum_mulacc(lambda s,a,b: torch.einsum(s, a.float(), b.float()).type(torch.promote_types(a.dtype, b.dtype)), lambda x: x.stride(), lambda x,s: x.expand(s)),
-  TernaryOps.WHERE: lambda x, y, z: torch.where(x != 0, y, z),
-  MovementOps.STRIDE: lambda x, arg: x[tuple(slice(None, None, abs(i)) for i in arg)].flip([i for i,a in enumerate(arg) if a < 0]),
-  MovementOps.EXPAND: lambda x, arg: x.expand(arg), MovementOps.PERMUTE: lambda x, arg: x.permute(arg)
-}}
-
-class RawTorchBuffer(RawBuffer):
-  def __init__(self, size:int, dtype:DType, buf:Optional[torch.Tensor]=None): super().__init__(size, dtype, buf if buf is not None else torch.empty([size], dtype=inverse_type_map[dtype]))
-  @classmethod
-  def fromCPU(cls, x):
-    buf = torch.from_numpy(x).requires_grad_(False).to(device)
-    return cls(prod(x.shape), type_map[buf.dtype], buf)
-  def toCPU(self): return self._buf.cpu().numpy()
-TorchBuffer = Interpreted(RawTorchBuffer, torch_fxn_for_op, from_underlying=lambda x: RawTorchBuffer(prod(x.shape), type_map[x.dtype], x))
tinygrad/runtime/ops_webgpu.py
DELETED
@@ -1,45 +0,0 @@
-import numpy as np
-import functools
-from wgpu.utils._device import get_default_device # type: ignore
-from tinygrad.runtime.lib import RawBufferCopyIn, LRUAllocator
-from tinygrad.helpers import dtypes, DType
-from tinygrad.ops import Compiled
-from tinygrad.codegen.linearizer import LinearizerOptions
-from tinygrad.renderer.cstyle import uops_to_cstyle
-from tinygrad.renderer.wgsl import WGSLLanguage
-import wgpu # type: ignore
-
-wgpu_device = get_default_device()
-
-class WebGPUProgram:
-  def __init__(self, name: str, prg: str, binary=False): self.name,self.prg = name,wgpu_device.create_shader_module(code=prg)
-  def __call__(self, global_size, local_size, *bufs, wait=False):
-    assert len(bufs) <= 8, "WEBGPU only supports 8 buffers"
-    binding_layouts = [{"binding": i, "visibility": wgpu.ShaderStage.COMPUTE, "buffer": {"type": wgpu.BufferBindingType.storage}} for i in range(len(bufs))]
-    bindings = [{"binding": i, "resource": {"buffer": x._buf, "offset": 0, "size": x._buf.size}} for i, x in enumerate(bufs)]
-    bind_group_layout = wgpu_device.create_bind_group_layout(entries=binding_layouts)
-    pipeline_layout = wgpu_device.create_pipeline_layout(bind_group_layouts=[bind_group_layout])
-    bind_group = wgpu_device.create_bind_group(layout=bind_group_layout, entries=bindings)
-    compute_pipeline = wgpu_device.create_compute_pipeline(layout=pipeline_layout,compute={"module": self.prg, "entry_point": self.name},)
-    command_encoder = wgpu_device.create_command_encoder()
-    compute_pass = command_encoder.begin_compute_pass()
-    compute_pass.set_pipeline(compute_pipeline)
-    compute_pass.set_bind_group(0, bind_group, [], 0, 999999) # last 2 not used
-    compute_pass.dispatch_workgroups(*global_size) # x y z
-    compute_pass.end()
-    wgpu_device.queue.submit([command_encoder.finish()])
-
-class RawWebGPUAllocator(LRUAllocator):
-  def _do_alloc(self, size, dtype, device, **kwargs): return wgpu_device.create_buffer(size=size*dtype.itemsize, usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.COPY_SRC)
-  def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
-WebGPUAlloc = RawWebGPUAllocator(wgpu_device.limits['max_buffer_size'])
-
-class RawWebGPUBuffer(RawBufferCopyIn):
-  def __init__(self, size:int, dtype:DType):
-    assert dtype not in [dtypes.int8,dtypes.uint8,dtypes.int64,dtypes.uint64,dtypes.double], f"dtype {dtype} not supported on WEBGPU"
-    super().__init__(size, dtype, allocator=WebGPUAlloc)
-  def _copyin(self, x:np.ndarray): wgpu_device.queue.write_buffer(self._buf, 0, np.ascontiguousarray(x))
-  def toCPU(self) -> np.ndarray: return np.frombuffer(wgpu_device.queue.read_buffer(self._buf, 0), dtype=np.dtype(self.dtype.np, metadata={"backing": self})) # type: ignore
-
-renderer = functools.partial(uops_to_cstyle, WGSLLanguage())
-WebGpuBuffer = Compiled(RawWebGPUBuffer, LinearizerOptions(supports_float4=False, local_max=[256, 256, 64], global_max=[65535, 65535, 65535]), renderer, WebGPUProgram)
tinygrad-0.7.0.dist-info/METADATA
DELETED
@@ -1,212 +0,0 @@
-Metadata-Version: 2.1
-Name: tinygrad
-Version: 0.7.0
-Summary: You like pytorch? You like micrograd? You love tinygrad! <3
-Author: George Hotz
-License: MIT
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: numpy
-Requires-Dist: requests
-Requires-Dist: pillow
-Requires-Dist: tqdm
-Requires-Dist: networkx
-Requires-Dist: pyopencl
-Requires-Dist: PyYAML
-Provides-Extra: arm
-Requires-Dist: unicorn ; extra == 'arm'
-Provides-Extra: cuda
-Requires-Dist: pycuda ; extra == 'cuda'
-Provides-Extra: linting
-Requires-Dist: flake8 ; extra == 'linting'
-Requires-Dist: pylint ; extra == 'linting'
-Requires-Dist: mypy ; extra == 'linting'
-Requires-Dist: pre-commit ; extra == 'linting'
-Provides-Extra: llvm
-Requires-Dist: llvmlite ; extra == 'llvm'
-Provides-Extra: metal
-Requires-Dist: pyobjc-framework-Metal ; extra == 'metal'
-Requires-Dist: pyobjc-framework-Cocoa ; extra == 'metal'
-Requires-Dist: pyobjc-framework-libdispatch ; extra == 'metal'
-Provides-Extra: testing
-Requires-Dist: torch ; extra == 'testing'
-Requires-Dist: pytest ; extra == 'testing'
-Requires-Dist: pytest-xdist ; extra == 'testing'
-Requires-Dist: onnx ; extra == 'testing'
-Requires-Dist: onnx2torch ; extra == 'testing'
-Requires-Dist: opencv-python ; extra == 'testing'
-Requires-Dist: tabulate ; extra == 'testing'
-Requires-Dist: safetensors ; extra == 'testing'
-Requires-Dist: types-PyYAML ; extra == 'testing'
-Requires-Dist: cloudpickle ; extra == 'testing'
-Requires-Dist: transformers ; extra == 'testing'
-Provides-Extra: triton
-Requires-Dist: triton >=2.0.0.dev20221202 ; extra == 'triton'
-Provides-Extra: webgpu
-Requires-Dist: wgpu ; extra == 'webgpu'
-
-<div align="center">
-
-[](https://tinygrad.org)
-
-tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) and [karpathy/micrograd](https://github.com/karpathy/micrograd). Maintained by [tiny corp](https://tinygrad.org).
-
-<h3>
-
-[Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](/docs) | [Examples](/examples) | [Showcase](/docs/showcase.md) | [Discord](https://discord.gg/ZjZadyC7PK)
-
-</h3>
-
-[](https://github.com/tinygrad/tinygrad/stargazers)
-[](https://github.com/tinygrad/tinygrad/actions/workflows/test.yml)
-[](https://discord.gg/ZjZadyC7PK)
-[](https://github.com/tinygrad/tinygrad)
-
-</div>
-
----
-
-This may not be the best deep learning framework, but it is a deep learning framework.
-
-Due to its extreme simplicity, it aims to be the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.
-
-tinygrad is still alpha software, but we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.
-
-## Features
-
-### LLaMA and Stable Diffusion
-
-tinygrad can run [LLaMA](/docs/showcase.md#llama) and [Stable Diffusion](/docs/showcase.md#stable-diffusion)!
-
-### Laziness
-
-Try a matmul. See how, despite the style, it is fused into one kernel with the power of laziness.
-
-```sh
-DEBUG=3 python3 -c "from tinygrad.tensor import Tensor;
-N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N);
-c = (a.reshape(N, 1, N) * b.permute(1,0).reshape(1, N, N)).sum(axis=2);
-print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
-```
-
-And we can change `DEBUG` to `4` to see the generated code.
-
-### Neural networks
-
-As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
-Throw in an optimizer, a data loader, and some compute, and you have all you need.
-
-#### Neural network example (from test/models/test_mnist.py)
-
-```py
-from tinygrad.tensor import Tensor
-import tinygrad.nn.optim as optim
-
-class TinyBobNet:
-  def __init__(self):
-    self.l1 = Tensor.uniform(784, 128)
-    self.l2 = Tensor.uniform(128, 10)
-
-  def forward(self, x):
-    return x.dot(self.l1).relu().dot(self.l2).log_softmax()
-
-model = TinyBobNet()
-optim = optim.SGD([model.l1, model.l2], lr=0.001)
-
-# ... complete data loader here
-
-out = model.forward(x)
-loss = out.mul(y).mean()
-optim.zero_grad()
-loss.backward()
-optim.step()
-```
-
-## Accelerators
-
-tinygrad already supports numerous accelerators, including:
-
-- [x] [CPU](tinygrad/runtime/ops_cpu.py)
-- [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
-- [x] [C Code (Clang)](tinygrad/runtime/ops_clang.py)
-- [x] [LLVM](tinygrad/runtime/ops_llvm.py)
-- [x] [METAL](tinygrad/runtime/ops_metal.py)
-- [x] [CUDA](tinygrad/runtime/ops_cuda.py)
-- [x] [Triton](extra/accel/triton/ops_triton.py)
-- [x] [PyTorch](tinygrad/runtime/ops_torch.py)
-- [x] [HIP](tinygrad/runtime/ops_hip.py)
-- [x] [WebGPU](tinygrad/runtime/ops_webgpu.py)
-
-And it is easy to add more! Your accelerator of choice only needs to support a total of 26 (optionally 27) low level ops.
-More information can be found in the [documentation for adding new accelerators](/docs/adding_new_accelerators.md).
-
-## Installation
-
-The current recommended way to install tinygrad is from source.
-
-### From source
-
-```sh
-git clone https://github.com/tinygrad/tinygrad.git
-cd tinygrad
-python3 -m pip install -e .
-```
-Don't forget the `.` at the end!
-
-## Documentation
-
-Documentation along with a quick start guide can be found in the [docs/](/docs) directory.
-
-### Quick example comparing to PyTorch
-
-```py
-from tinygrad.tensor import Tensor
-
-x = Tensor.eye(3, requires_grad=True)
-y = Tensor([[2.0,0,-2.0]], requires_grad=True)
-z = y.matmul(x).sum()
-z.backward()
-
-print(x.grad.numpy())  # dz/dx
-print(y.grad.numpy())  # dz/dy
-```
-
-The same thing but in PyTorch:
-```py
-import torch
-
-x = torch.eye(3, requires_grad=True)
-y = torch.tensor([[2.0,0,-2.0]], requires_grad=True)
-z = y.matmul(x).sum()
-z.backward()
-
-print(x.grad.numpy())  # dz/dx
-print(y.grad.numpy())  # dz/dy
-```
-
-## Contributing
-
-There has been a lot of interest in tinygrad lately. Here are some basic guidelines for contributing:
-
-- Bug fixes are the best and always welcome! Like [this one](https://github.com/tinygrad/tinygrad/pull/421/files).
-- If you don't understand the code you are changing, don't change it!
-- All code golf PRs will be closed, but [conceptual cleanups](https://github.com/tinygrad/tinygrad/pull/372/files) are great.
-- Features are welcome. Though if you are adding a feature, you need to include tests.
-- Improving test coverage is great, with reliable non-brittle tests.
-
-Additional guidelines can be found in [CONTRIBUTING.md](/CONTRIBUTING.md).
-
-### Running tests
-
-For more examples on how to run the full test suite please refer to the [CI workflow](.github/workflows/test.yml).
-
-Some examples:
-```sh
-python3 -m pip install -e '.[testing]'
-python3 -m pytest
-python3 -m pytest -v -k TestTrain
-python3 ./test/models/test_train.py TestTrain.test_efficientnet
-```
tinygrad-0.7.0.dist-info/RECORD
DELETED
@@ -1,40 +0,0 @@
-tinygrad/graph.py,sha256=s5f1OQKb9AF8oYFMAl2oTIH7FRAsK-IZDJ7oG_mEVsc,3933
-tinygrad/helpers.py,sha256=dGbfameyEU60Or6CYGOIhZZk5RShMegD4ZvNmQaweDM,7073
-tinygrad/jit.py,sha256=kHYEMr6YQYHPu26ff5zQldx9xuYAjWhmiK6ag3PpVGo,3801
-tinygrad/lazy.py,sha256=63O1kONydeocrRI88iAXbxPC559PnAZNPhr6Gi6SjdA,22926
-tinygrad/mlops.py,sha256=pWFGP96p06pY9r8SDdGiyw7RzC10HqGtuFbJ-2aWvSU,7910
-tinygrad/ops.py,sha256=_h1CpDHoghauB6m3DEyhB1d2g64t2xT26nzFJn5quMk,13966
-tinygrad/tensor.py,sha256=xI82653sfLLPDvvVpIHpdlj7LsBqOR1hTvjSFahRpno,42142
-tinygrad/codegen/assembly.py,sha256=Vv6wFyxqTkgmfkZFFpmRCkV71gVjT170--T6X4Y-JO0,11033
-tinygrad/codegen/kernel.py,sha256=miUZtef3JXbDYxPhcif0C2sgVe2QyMs8SPi94O7cEjg,7416
-tinygrad/codegen/linearizer.py,sha256=rymEs3E5tmYdRbdTKobGst7JlAQHkkJbZFwcR6hTIpA,22228
-tinygrad/codegen/optimizer.py,sha256=Eq6oro8lZlb3-8qss025j3yn1Sg4qbKG03kJUjYnjpo,19961
-tinygrad/codegen/search.py,sha256=sYU7tw2suC_9T4lMiOvdQ5uDaMj_Daj4xhLiVvB8eGw,2881
-tinygrad/nn/__init__.py,sha256=k5BnAeWmE3s6J4iNLozZsZe4tIaa9OjRQAyWHd9xApk,7225
-tinygrad/nn/image.py,sha256=MAjbIUPXpGETloctzJvKbWJ59koz60hqrsK7TrWyaKE,5007
-tinygrad/nn/optim.py,sha256=9mcy0Cs0HC68AZEV3j_jhxgmiNUrTYW-ZoTWB9_Rr38,3466
-tinygrad/nn/state.py,sha256=mHsW1W5KDrc2yLNV-Mk_OFjObpAvch7XhC5ybDSWz_w,6773
-tinygrad/renderer/assembly_arm64.py,sha256=dc_IZtFETl6naMIJ0InNurL2COP67CwooUnaA9jexbI,8997
-tinygrad/renderer/assembly_ptx.py,sha256=wj8--S-re8yXBZ5YsP1aCsLvguaUrsmn6qbRYAyxJhg,6508
-tinygrad/renderer/cstyle.py,sha256=v52rzVXkrEnb8OQYCmsa5g_90qwr5qiYUzaLjqujtlQ,11334
-tinygrad/renderer/llvmir.py,sha256=n0FGxjbztwyDv7AZZpjEDirCIHS07t9AuXFR_5rcrx4,8965
-tinygrad/renderer/wgsl.py,sha256=ATQVyrLwPh2K1Zgff-_ZF6v5JHHmaHwHKjlzwmV5ySI,3154
-tinygrad/runtime/lib.py,sha256=Xy8_4h7f72aeFakSQUflc5tyBprhLNzN2htQhWk5SHo,6383
-tinygrad/runtime/ops_clang.py,sha256=tCgjBiTn7sg6AGWWfBqVf_spaddkDNE0ctqbIGq3c5k,4929
-tinygrad/runtime/ops_cpu.py,sha256=dI813Q1l2k23NJFptgJZcrZBQNHJ7-WtprjxjZKZGCk,3676
-tinygrad/runtime/ops_cuda.py,sha256=-KmYL9dNrXEtveXNA9lM2cuDePYEAwx3nizJ3bMynPY,6333
-tinygrad/runtime/ops_disk.py,sha256=SJQVat4_ic-1EOmHA4A3HUzACZ6hZodH60OLvS26hfg,2224
-tinygrad/runtime/ops_gpu.py,sha256=0Rg_WVZRYbQPRe8xFxqGVBcOkIrf4nnNAqWQDxmYNj0,6980
-tinygrad/runtime/ops_hip.py,sha256=5UCgFwHZHm7Ta7-IqTv9T1ir4T35uAoWiPGSSLySOmc,5051
-tinygrad/runtime/ops_llvm.py,sha256=HDU1DwU43-EGMMu9k08piKKuglNZw8LViY-dFU_G2i0,2826
-tinygrad/runtime/ops_metal.py,sha256=eXXf-pkojU3LtHCy7MhKOg45p80G_f2UhwZ7t6qEQ2c,5284
-tinygrad/runtime/ops_shm.py,sha256=MgQwCDlP5n0h937tBlDTaCxuiIGRvBaJFW7vEqwKnnE,1363
-tinygrad/runtime/ops_torch.py,sha256=LDR_3ajKer6HECcs-UyhxCWwscooggKx_MS5zjVNops,2335
-tinygrad/runtime/ops_webgpu.py,sha256=8gbJa2sAeFXtBmFOCR1FbziV0ZXFl7mbFb8r5HGH_1Y,3070
-tinygrad/shape/shapetracker.py,sha256=ks5xoWg_8F4VNHtjJUlbxe_CGyCVD4SI78dmObO3xww,15046
-tinygrad/shape/symbolic.py,sha256=QI3ACNfGv8ZQUvjplmFguAs3w3W5lfbHz8b-Hk1NsmY,13921
-tinygrad-0.7.0.dist-info/LICENSE,sha256=6cp1Hqk0v7NMg1j6OXty_1vAZ4EIwZdCySIoHrCS6RI,1055
-tinygrad-0.7.0.dist-info/METADATA,sha256=SJaE_k9-bcaF3GkoMxSv_tN3lFyFkLZY38EjN3hmdk0,7180
-tinygrad-0.7.0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-tinygrad-0.7.0.dist-info/top_level.txt,sha256=vDABMCWBFQnx2kn9Azueu88FP-1klQdePoHikQhHymc,9
-tinygrad-0.7.0.dist-info/RECORD,,
{tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt
File without changes