tinygrad 0.9.1.tar.gz → 0.10.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tinygrad-0.9.1 → tinygrad-0.10.0}/PKG-INFO +21 -17
- {tinygrad-0.9.1 → tinygrad-0.10.0}/README.md +13 -11
- {tinygrad-0.9.1 → tinygrad-0.10.0}/setup.py +13 -9
- tinygrad-0.10.0/test/test_arange.py +179 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_assign.py +17 -4
- tinygrad-0.10.0/test/test_compile_failures.py +18 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_const_folding.py +27 -12
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_conv.py +8 -0
- tinygrad-0.10.0/test/test_conv_shapetracker.py +58 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_copy_speed.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_device_speed.py +1 -2
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_dtype.py +172 -35
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_dtype_alu.py +27 -15
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_fusion_op.py +28 -9
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_fuzz_shape_ops.py +2 -2
- tinygrad-0.10.0/test/test_gc.py +67 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_graph.py +1 -2
- tinygrad-0.10.0/test/test_hcq.py +475 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_image_dtype.py +51 -11
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_jit.py +79 -2
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_lazybuffer.py +34 -13
- tinygrad-0.10.0/test/test_linearizer.py +2174 -0
- tinygrad-0.10.0/test/test_linearizer_dumb.py +223 -0
- tinygrad-0.10.0/test/test_linearizer_failures.py +1435 -0
- tinygrad-0.10.0/test/test_linearizer_overflows.py +196 -0
- tinygrad-0.10.0/test/test_metal.py +77 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_multitensor.py +202 -47
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_nn.py +259 -30
- tinygrad-0.10.0/test/test_ocl.py +31 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_ops.py +615 -52
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_optim.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_pickle.py +52 -6
- tinygrad-0.10.0/test/test_profiler.py +221 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_randomness.py +146 -28
- tinygrad-0.10.0/test/test_rearrange_einops.py +321 -0
- tinygrad-0.10.0/test/test_renderer_failures.py +68 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_sample.py +1 -2
- tinygrad-0.10.0/test/test_schedule.py +1859 -0
- tinygrad-0.10.0/test/test_search.py +158 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_setitem.py +23 -8
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_specific_conv.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_speed_v_torch.py +9 -16
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_subbuffer.py +2 -3
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_jit.py +1 -3
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_ops.py +2 -2
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_shapetracker.py +37 -40
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor.py +182 -11
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor_data.py +12 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor_variable.py +36 -20
- tinygrad-0.10.0/test/test_tiny.py +84 -0
- tinygrad-0.10.0/test/test_transcendental.py +121 -0
- tinygrad-0.10.0/test/test_uop_graph.py +716 -0
- tinygrad-0.10.0/test/test_uops.py +454 -0
- tinygrad-0.10.0/test/test_uops_stats.py +224 -0
- tinygrad-0.10.0/test/test_viz.py +93 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_winograd.py +7 -6
- tinygrad-0.10.0/tinygrad/__init__.py +11 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/codegen/kernel.py +308 -175
- tinygrad-0.10.0/tinygrad/codegen/linearize.py +95 -0
- tinygrad-0.10.0/tinygrad/codegen/lowerer.py +143 -0
- tinygrad-0.10.0/tinygrad/codegen/transcendental.py +257 -0
- tinygrad-0.10.0/tinygrad/codegen/uopgraph.py +506 -0
- tinygrad-0.10.0/tinygrad/device.py +221 -0
- tinygrad-0.10.0/tinygrad/dtype.py +188 -0
- tinygrad-0.10.0/tinygrad/engine/jit.py +295 -0
- {tinygrad-0.9.1/tinygrad → tinygrad-0.10.0/tinygrad/engine}/lazy.py +74 -66
- tinygrad-0.10.0/tinygrad/engine/memory.py +51 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/realize.py +86 -61
- tinygrad-0.10.0/tinygrad/engine/schedule.py +419 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/search.py +58 -47
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/function.py +59 -58
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/helpers.py +120 -102
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/multi.py +82 -78
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/__init__.py +116 -67
- tinygrad-0.10.0/tinygrad/nn/datasets.py +15 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/optim.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/state.py +91 -6
- tinygrad-0.10.0/tinygrad/ops.py +1152 -0
- tinygrad-0.10.0/tinygrad/renderer/__init__.py +89 -0
- tinygrad-0.10.0/tinygrad/renderer/cstyle.py +462 -0
- tinygrad-0.10.0/tinygrad/renderer/llvmir.py +142 -0
- tinygrad-0.10.0/tinygrad/renderer/ptx.py +225 -0
- tinygrad-0.10.0/tinygrad/runtime/autogen/adreno.py +17904 -0
- tinygrad-0.10.0/tinygrad/runtime/autogen/amd_gpu.py +48384 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/cuda.py +6 -162
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/io_uring.py +97 -63
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/kfd.py +60 -47
- tinygrad-0.10.0/tinygrad/runtime/autogen/kgsl.py +1386 -0
- tinygrad-0.10.0/tinygrad/runtime/autogen/libc.py +5462 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
- tinygrad-0.10.0/tinygrad/runtime/autogen/nvrtc.py +579 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/opencl.py +11 -11
- tinygrad-0.10.0/tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/clang.py +3 -3
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/cuda.py +11 -15
- tinygrad-0.10.0/tinygrad/runtime/graph/hcq.py +200 -0
- tinygrad-0.10.0/tinygrad/runtime/graph/metal.py +103 -0
- tinygrad-0.10.0/tinygrad/runtime/ops_amd.py +471 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_clang.py +12 -5
- tinygrad-0.10.0/tinygrad/runtime/ops_cloud.py +220 -0
- tinygrad-0.10.0/tinygrad/runtime/ops_cuda.py +128 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_disk.py +25 -26
- tinygrad-0.10.0/tinygrad/runtime/ops_dsp.py +181 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_gpu.py +29 -16
- tinygrad-0.10.0/tinygrad/runtime/ops_hip.py +68 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_llvm.py +15 -10
- tinygrad-0.10.0/tinygrad/runtime/ops_metal.py +188 -0
- tinygrad-0.10.0/tinygrad/runtime/ops_nv.py +584 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_python.py +78 -79
- tinygrad-0.10.0/tinygrad/runtime/ops_qcom.py +405 -0
- tinygrad-0.10.0/tinygrad/runtime/support/compiler_cuda.py +77 -0
- tinygrad-0.9.1/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.10.0/tinygrad/runtime/support/compiler_hip.py +13 -1
- tinygrad-0.10.0/tinygrad/runtime/support/elf.py +38 -0
- tinygrad-0.10.0/tinygrad/runtime/support/hcq.py +539 -0
- tinygrad-0.10.0/tinygrad/shape/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/shape/shapetracker.py +40 -50
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/shape/view.py +102 -63
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/tensor.py +1109 -365
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/PKG-INFO +21 -17
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/SOURCES.txt +33 -13
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/requires.txt +6 -6
- tinygrad-0.9.1/test/test_arange.py +0 -19
- tinygrad-0.9.1/test/test_conv_shapetracker.py +0 -22
- tinygrad-0.9.1/test/test_custom_function.py +0 -106
- tinygrad-0.9.1/test/test_gc.py +0 -37
- tinygrad-0.9.1/test/test_lazyop.py +0 -34
- tinygrad-0.9.1/test/test_linearizer.py +0 -1778
- tinygrad-0.9.1/test/test_linearizer_failures.py +0 -255
- tinygrad-0.9.1/test/test_linearizer_overflows.py +0 -89
- tinygrad-0.9.1/test/test_pattern_matcher.py +0 -168
- tinygrad-0.9.1/test/test_print_tree.py +0 -66
- tinygrad-0.9.1/test/test_schedule.py +0 -1156
- tinygrad-0.9.1/test/test_search.py +0 -101
- tinygrad-0.9.1/test/test_uop_graph.py +0 -190
- tinygrad-0.9.1/test/test_uops.py +0 -319
- tinygrad-0.9.1/test/test_uops_stats.py +0 -81
- tinygrad-0.9.1/test/test_verify_lazyop.py +0 -64
- tinygrad-0.9.1/tinygrad/__init__.py +0 -6
- tinygrad-0.9.1/tinygrad/codegen/linearizer.py +0 -528
- tinygrad-0.9.1/tinygrad/codegen/uops.py +0 -451
- tinygrad-0.9.1/tinygrad/device.py +0 -320
- tinygrad-0.9.1/tinygrad/dtype.py +0 -113
- tinygrad-0.9.1/tinygrad/engine/graph.py +0 -100
- tinygrad-0.9.1/tinygrad/engine/jit.py +0 -198
- tinygrad-0.9.1/tinygrad/engine/schedule.py +0 -370
- tinygrad-0.9.1/tinygrad/nn/datasets.py +0 -8
- tinygrad-0.9.1/tinygrad/ops.py +0 -169
- tinygrad-0.9.1/tinygrad/renderer/__init__.py +0 -65
- tinygrad-0.9.1/tinygrad/renderer/assembly.py +0 -269
- tinygrad-0.9.1/tinygrad/renderer/cstyle.py +0 -389
- tinygrad-0.9.1/tinygrad/renderer/llvmir.py +0 -160
- tinygrad-0.9.1/tinygrad/runtime/autogen/amd_gpu.py +0 -13403
- tinygrad-0.9.1/tinygrad/runtime/graph/hcq.py +0 -187
- tinygrad-0.9.1/tinygrad/runtime/graph/metal.py +0 -75
- tinygrad-0.9.1/tinygrad/runtime/ops_amd.py +0 -550
- tinygrad-0.9.1/tinygrad/runtime/ops_cuda.py +0 -185
- tinygrad-0.9.1/tinygrad/runtime/ops_metal.py +0 -105
- tinygrad-0.9.1/tinygrad/runtime/ops_nv.py +0 -625
- tinygrad-0.9.1/tinygrad/shape/symbolic.py +0 -327
- {tinygrad-0.9.1 → tinygrad-0.10.0}/LICENSE +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/setup.cfg +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_kernel_cache.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_masked_st.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_method_cache.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_net_speed.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_to_numpy.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_zero_copy.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/codegen/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/__init__.py +0 -0
- /tinygrad-0.9.1/tinygrad/runtime/__init__.py → /tinygrad-0.10.0/tinygrad/py.typed +0 -0
- {tinygrad-0.9.1/tinygrad/runtime/driver → tinygrad-0.10.0/tinygrad/runtime}/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/comgr.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/hip.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/hsa.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_npy.py +0 -0
- {tinygrad-0.9.1/tinygrad/shape → tinygrad-0.10.0/tinygrad/runtime/support}/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/dependency_links.txt +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/top_level.txt +0 -0
{tinygrad-0.9.1 → tinygrad-0.10.0}/PKG-INFO

@@ -1,17 +1,14 @@
 Metadata-Version: 2.1
 Name: tinygrad
-Version: 0.9.1
+Version: 0.10.0
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy
-Requires-Dist: pyobjc-framework-Metal; platform_system == "Darwin"
-Requires-Dist: pyobjc-framework-libdispatch; platform_system == "Darwin"
 Provides-Extra: llvm
 Requires-Dist: llvmlite; extra == "llvm"
 Provides-Extra: arm
@@ -20,12 +17,13 @@ Provides-Extra: triton
 Requires-Dist: triton-nightly>=2.1.0.dev20231014192330; extra == "triton"
 Provides-Extra: linting
 Requires-Dist: pylint; extra == "linting"
-Requires-Dist: mypy; extra == "linting"
+Requires-Dist: mypy==1.11.2; extra == "linting"
 Requires-Dist: typing-extensions; extra == "linting"
 Requires-Dist: pre-commit; extra == "linting"
 Requires-Dist: ruff; extra == "linting"
 Requires-Dist: types-tqdm; extra == "linting"
 Provides-Extra: testing
+Requires-Dist: numpy; extra == "testing"
 Requires-Dist: torch; extra == "testing"
 Requires-Dist: pillow; extra == "testing"
 Requires-Dist: pytest; extra == "testing"
@@ -39,17 +37,21 @@ Requires-Dist: safetensors; extra == "testing"
 Requires-Dist: transformers; extra == "testing"
 Requires-Dist: sentencepiece; extra == "testing"
 Requires-Dist: tiktoken; extra == "testing"
+Requires-Dist: blobfile; extra == "testing"
 Requires-Dist: librosa; extra == "testing"
 Requires-Dist: networkx; extra == "testing"
 Requires-Dist: hypothesis; extra == "testing"
 Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
+Requires-Dist: ggml-python; extra == "testing"
 Provides-Extra: docs
+Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
 Requires-Dist: mkdocstrings[python]; extra == "docs"
 Requires-Dist: markdown-callouts; extra == "docs"
 Requires-Dist: markdown-exec[ansi]; extra == "docs"
 Requires-Dist: black; extra == "docs"
+Requires-Dist: numpy; extra == "docs"
 Provides-Extra: testing-tf
 Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
 Requires-Dist: tensorflow_addons; extra == "testing-tf"
@@ -107,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.

-```
+```python
 from tinygrad import Tensor, nn

 class LinearNet:
@@ -122,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

-
-
-
-
-
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```

 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -142,9 +145,12 @@ tinygrad already supports numerous accelerators, including:
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
 - [x] [AMD](tinygrad/runtime/ops_amd.py)
 - [x] [NV](tinygrad/runtime/ops_nv.py)
+- [x] [QCOM](tinygrad/runtime/ops_qcom.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

+To check default accelerator run: `python3 -c "from tinygrad import Device; print(Device.DEFAULT)"`
+
 ## Installation

 The current recommended way to install tinygrad is from source.
@@ -169,7 +175,7 @@ Documentation along with a quick start guide can be found on the [docs website](

 ### Quick example comparing to PyTorch

-```
+```python
 from tinygrad import Tensor

 x = Tensor.eye(3, requires_grad=True)
@@ -182,7 +188,7 @@ print(y.grad.numpy()) # dz/dy
 ```

 The same thing but in PyTorch:
-```
+```python
 import torch

 x = torch.eye(3, requires_grad=True)
@@ -230,6 +236,4 @@ python3 -m pytest test/ # whole test suite

 #### Process replay tests

-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/README.md) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [pr] in the pull request title.
{tinygrad-0.9.1 → tinygrad-0.10.0}/README.md

@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.

-```
+```python
 from tinygrad import Tensor, nn

 class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

-
-
-
-
-
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```

 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -86,9 +87,12 @@ tinygrad already supports numerous accelerators, including:
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
 - [x] [AMD](tinygrad/runtime/ops_amd.py)
 - [x] [NV](tinygrad/runtime/ops_nv.py)
+- [x] [QCOM](tinygrad/runtime/ops_qcom.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

+To check default accelerator run: `python3 -c "from tinygrad import Device; print(Device.DEFAULT)"`
+
 ## Installation

 The current recommended way to install tinygrad is from source.
@@ -113,7 +117,7 @@ Documentation along with a quick start guide can be found on the [docs website](

 ### Quick example comparing to PyTorch

-```
+```python
 from tinygrad import Tensor

 x = Tensor.eye(3, requires_grad=True)
@@ -126,7 +130,7 @@ print(y.grad.numpy()) # dz/dy
 ```

 The same thing but in PyTorch:
-```
+```python
 import torch

 x = torch.eye(3, requires_grad=True)
@@ -174,6 +178,4 @@ python3 -m pytest test/ # whole test suite

 #### Process replay tests

-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/README.md) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [pr] in the pull request title.
{tinygrad-0.9.1 → tinygrad-0.10.0}/setup.py

@@ -8,36 +8,36 @@ with open(directory / 'README.md', encoding='utf-8') as f:
   long_description = f.read()

 setup(name='tinygrad',
-      version='0.9.1',
+      version='0.10.0',
       description='You like pytorch? You like micrograd? You love tinygrad! <3',
       author='George Hotz',
       license='MIT',
       long_description=long_description,
       long_description_content_type='text/markdown',
       packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                  'tinygrad.runtime', 'tinygrad.runtime.
+                  'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+      package_data = {'tinygrad': ['py.typed']},
       classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License"
       ],
-      install_requires=[
-
-                        "pyobjc-framework-libdispatch; platform_system=='Darwin'"],
-      python_requires='>=3.8',
+      install_requires=[],
+      python_requires='>=3.10',
       extras_require={
         'llvm': ["llvmlite"],
         'arm': ["unicorn"],
         'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
         'linting': [
           "pylint",
-          "mypy",
+          "mypy==1.11.2",
           "typing-extensions",
           "pre-commit",
           "ruff",
           "types-tqdm",
         ],
-        #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.
+        #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.1.0-rc3"],
         'testing': [
+          "numpy",
           "torch",
           "pillow",
           "pytest",
@@ -51,18 +51,22 @@ setup(name='tinygrad',
           "transformers",
           "sentencepiece",
           "tiktoken",
+          "blobfile",
           "librosa",
           "networkx",
           "hypothesis",
           "nibabel",
           "bottle",
+          "ggml-python"
         ],
         'docs': [
+          "mkdocs",
           "mkdocs-material",
           "mkdocstrings[python]",
           "markdown-callouts",
           "markdown-exec[ansi]",
-          "black"
+          "black",
+          "numpy",
         ],
         'testing_tf': [
           "tensorflow==2.15.1",
tinygrad-0.10.0/test/test_arange.py

@@ -0,0 +1,179 @@
+import unittest, contextlib
+import numpy as np
+from tinygrad import Tensor, GlobalCounters, dtypes, nn
+from tinygrad.helpers import CI, Context, getenv
+from tinygrad.engine.realize import run_schedule
+from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+from tinygrad.engine.realize import CompiledRunner, ExecItem
+from tinygrad.engine.search import get_kernel_actions
+
+class TestArange(unittest.TestCase):
+  def _get_flops(self, N, opts=None):
+    GlobalCounters.reset()
+    tt = Tensor.arange(N)
+    sched = tt.schedule()
+    self.assertEqual(len(sched), 1)
+    k = Kernel(sched[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    p = k.to_program()
+    print(p.name)
+    #print(p.src)
+    ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+    np.testing.assert_equal(tt.numpy(), np.arange(N))
+    return p.op_estimate
+
+  def test_complexity(self, opts=None, limit=None):
+    # add 1 to avoid divide by 0. arange is 0 flops now!
+    f1 = self._get_flops(256, opts) + 1
+    f2 = self._get_flops(2560, opts) + 1
+    print(f"{f1=}, {f2=}")
+    assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+    if limit is not None and not getenv("PTX"):
+      # PTX counts index ALU in flops
+      assert f1 <= limit, f"{f1=}, {limit=}"
+
+  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=1)
+  def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=1)
+  def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=1)
+  def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=1)
+  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=1)
+
+  @unittest.skip("doesn't work yet")
+  def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+  def test_all_opts(self, opts=None, exclude=None):
+    k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+    for opts in all_opts:
+      if exclude is not None and opts[-1] in exclude: continue
+      print(opts)
+      self.test_complexity(opts)
+  def test_all_opts_w_local(self):
+    with contextlib.suppress(KernelOptError):
+      return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+  def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+  def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+  def test_all_opts_w_upcast_and_unroll(self):
+    return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+class TestIndexing(unittest.TestCase):
+  def test_arange_2_reduce(self):
+    needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+    needle[1337] = 1
+    needle.realize()
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      # TODO: it should work without these reshapes
+      out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+      sched = out.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+    assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_manual_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+      idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+      reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+      X = full.sum(axis=(2,3))
+      sched = X.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      # TODO: enable these asserts when the scheduler can handle this
+      #assert len(sched) == 1, f"{len(sched)} != 1"
+      run_schedule(sched)
+      #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index_fused(self, noopt=1):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      assert len(sched) == 2
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(real_index, X.numpy())
+  @unittest.skip("not ready")
+  def test_index_fused_opt(self): self.test_index_fused(0)
+
+  def test_index_fused_out_of_bounds(self):
+    dataset = Tensor.rand(256, 256).realize()
+    idxs = Tensor([-19238, -257, 256, 495, 10982377]).realize()
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      X = dataset[idxs]
+      np.testing.assert_equal(X.numpy(), 0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_index_mnist(self, noopt=1, op_limit=512*784*5):
+    from tinygrad.nn.datasets import mnist
+    X_train, Y_train, _, _ = mnist()
+    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+      samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0]).realize()
+      GlobalCounters.reset()
+      x = X_train[samples].numpy()
+      y = Y_train[samples].numpy()
+      assert GlobalCounters.global_ops < op_limit, f"too many ops {GlobalCounters.global_ops} != {op_limit}"
+      np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+      np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+  @unittest.skip("not ready")
+  def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_llama_embedding(self, noopt=1, op_limit=65536):
+    # llama3 is 128256
+    vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+    emb = nn.Embedding(vocab_size, embed_size)
+    emb_w = emb.weight.numpy()
+    x = Tensor([1,2,3,4])
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      z = emb(x).realize()
+      self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+      self.assertEqual(GlobalCounters.kernel_count, 2)
+    if getenv("CHECK", 1):
+      import torch
+      with torch.no_grad():
+        torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+        torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+      torch_z = torch_emb(torch.tensor(x.numpy()))
+      # TODO: reshape to match torch, should we do this in nn?
+      np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+  # at least the arange is being fused
+  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000 if CI else 5_898_240_000)
+
+if __name__ == "__main__":
+  unittest.main()
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_assign.py

@@ -2,6 +2,7 @@
 import unittest
 import numpy as np
 from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
+from tinygrad.engine.schedule import create_schedule

 N = 200 # has to be bigger than the cache to fail

@@ -57,10 +58,12 @@ class TestAssign(unittest.TestCase):
     x.realize()
     x = Tensor([0])
     f(x)
-
+    out = x.item()
+    assert out == 1, f"expected 1, got {out}"
     x = Tensor([0])
     f(x)
-
+    out = x.item()
+    assert out == 1, f"expected 1, got {out}"

   def test_assign_add_jit(self):
     @TinyJit
@@ -165,6 +168,16 @@ class TestAssign(unittest.TestCase):
     a += 1
     np.testing.assert_allclose(a.numpy(), 3)

+  # NOTE: this is similar to the resnet failure
+  #@unittest.expectedFailure
+  def test_double_assign_alt(self):
+    a = Tensor.ones(4).contiguous().realize()
+    b = Tensor([1, 2, 3, 4]).realize().lazydata
+    a1 = a.lazydata.assign(b)
+    a2 = a.lazydata.assign(b)
+    sched = create_schedule([a1, a2])
+    self.assertEqual(len(sched), 1)
+
   def test_crossover_assign(self):
     a = Tensor.full((4,), 2).contiguous().realize()
     b = Tensor.full((4,), 3).contiguous().realize()
@@ -347,7 +360,7 @@ class TestAssign(unittest.TestCase):

   def test_permuted_assignment_masked_view_possible(self):
     a = Tensor.ones(4, 4).contiguous().realize()
-    b = a.shrink((None, (0, 2))).pad((None, (0, 2)), 2)
+    b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2)
     a.assign(a + b)
     kc = GlobalCounters.kernel_count
     a.realize()
@@ -357,7 +370,7 @@ class TestAssign(unittest.TestCase):
   def test_permuted_assignment_masked_view_not_contiguous(self):
     a = Tensor.ones(4, 4).contiguous().realize()
     with self.assertRaisesRegex(RuntimeError, "contiguous"):
-      b = a.shrink((None, (0, 2))).pad((None, (0, 2)), 2).permute(1, 0)
+      b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2).permute(1, 0)
       a.assign(a + b)
       a.realize()

tinygrad-0.10.0/test/test_compile_failures.py

@@ -0,0 +1,18 @@
+import unittest
+from tinygrad import Tensor, dtypes, Device
+from tinygrad.engine.realize import lower_schedule
+from tinygrad.device import is_dtype_supported
+
+class TestCompileFailures(unittest.TestCase):
+  def compile(self, out:Tensor):
+    for _ in lower_schedule(out.schedule()): pass
+
+  @unittest.skipUnless(is_dtype_supported(dtypes.uchar, Device.DEFAULT), f"no uint8 on {Device.DEFAULT}")
+  def test_interpolate_atari(self):
+    self.compile(Tensor.empty(210, 160, dtype='uint8').interpolate((64, 64)))
+
+  def test_add_max_uchar(self):
+    self.compile((Tensor.empty(1024, dtype='uint8') + Tensor.empty(1024, dtype='uint8')).max())
+
+if __name__ == '__main__':
+  unittest.main()
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_const_folding.py

@@ -1,15 +1,15 @@
 import unittest, math
 from tinygrad import Tensor, Device, dtypes
+from tinygrad.ops import Ops
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.helpers import CI
-from tinygrad.ops import BufferOps
 import numpy as np
-from
+from tinygrad.device import is_dtype_supported

 def _check_ast_count(desired_count:int, t:Tensor):
   # NOTE: this has side effect because everything can be scheduled only once
   schedule = create_schedule(t.lazydata.lbs)
-  asts = [s for s in schedule if s.ast
+  asts = [s for s in schedule if s.ast.op is Ops.SINK]
   assert len(asts) == desired_count

 class TestUnaryOpsConstFolding(unittest.TestCase):
@@ -23,6 +23,7 @@ class TestUnaryOpsConstFolding(unittest.TestCase):
     _check_ast_count(0, Tensor.ones(4).cast(dtypes.int16))
     _check_ast_count(0, Tensor.full(4, fill_value=-1).cast(dtypes.uint16))

+  @unittest.expectedFailure # no two level fold at lazybuffer
   def test_neg_folding(self):
     _check_ast_count(0, Tensor([1, 2, 3]).mul(-1).neg())
     _check_ast_count(0, Tensor([1, 2, 3]).neg().mul(-1))
@@ -78,6 +79,11 @@ class TestBinaryOpsConstFolding(unittest.TestCase):
   def test_div_tensor_one(self):
     _check_ast_count(0, Tensor([1.0, 2, 3, 4]) / Tensor.ones(4))

+  def test_idiv_literal_one(self):
+    _check_ast_count(0, Tensor([1, 2, 3, 4]) // 1)
+  def test_idiv_tensor_one(self):
+    _check_ast_count(0, Tensor([1, 2, 3, 4]) // Tensor.ones(4, dtype=dtypes.int32))
+
   def test_pow_literal_zero(self):
     _check_ast_count(0, Tensor([1.0, 2, 3, 4]) ** 0)
   def test_pow_tensor_zero(self):
@@ -124,13 +130,16 @@ class TestMovedConstFolding(unittest.TestCase):

   def test_cast_padded(self):
     # NOTE: this is folded due to CAST_BEFORE_VIEW
-
-
-
-
+    if is_dtype_supported(dtypes.int16):
+      _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
+      np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
+    if is_dtype_supported(dtypes.uint16):
+      _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
+      np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
     # not folded
-
-
+    if is_dtype_supported(dtypes.int64):
+      _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64))
+      np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0])

 class TestReduceOpsConstFolding(unittest.TestCase):
   def test_const_sum(self):
@@ -145,10 +154,18 @@ class TestReduceOpsConstFolding(unittest.TestCase):
     _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).sum())
     np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).sum().numpy(), 4)

-    # NOTE: cannot just count the non-padded area because some
+    # NOTE: cannot just count the non-padded area because some Ops f do not have f(0) = 0.
     _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).exp().sum())
     np.testing.assert_allclose(Tensor.ones(4).pad(((1, 1),)).exp().sum().numpy(), 4 * math.e + 2)

+  def test_const_prod(self):
+    _check_ast_count(0, Tensor.full((2, 3), fill_value=2).prod())
+    np.testing.assert_equal(Tensor.full((2, 3), fill_value=2).prod().numpy(), 2**(2*3))
+    _check_ast_count(0, Tensor.full((4, 5, 6), fill_value=2).prod(axis=0))
+    np.testing.assert_equal(Tensor.full((4, 5, 6), fill_value=2).prod(axis=0).numpy(), np.full((5, 6), 2**4))
+    _check_ast_count(0, Tensor(4).prod())
+    np.testing.assert_equal(Tensor(4).prod().numpy(), 4)
+
   def test_const_max(self):
     _check_ast_count(0, Tensor.ones(4, 5, 6).max())
     np.testing.assert_equal(Tensor.ones(4, 5, 6).max().numpy(), 1)
@@ -234,7 +251,6 @@ class TestTautologicalCompare(unittest.TestCase):
     np.testing.assert_equal((Tensor(True) < Tensor(False)).numpy(), False)
     np.testing.assert_equal((Tensor(True) < Tensor(True)).numpy(), False)

-  @unittest.skip("not implemented yet")
   def test_a_eq_a(self):
     # self eq is always true for int or bool
     a = Tensor([1, 2, 3])
@@ -244,7 +260,6 @@ class TestTautologicalCompare(unittest.TestCase):
     a = Tensor([math.nan, 1.0, 2.0])
     np.testing.assert_equal((a == a).numpy(), [False, True, True])

-  @unittest.skip("not implemented yet")
   def test_a_ne_a(self):
     # self not eq is always false for int or bool
     a = Tensor([1, 2, 3])
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_conv.py

@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):

     print(ret.numpy())

+  def test_two_binops_no_rerun_small(self):
+    Tensor.no_grad = True
+    x = Tensor.rand(1,1,32,32)
+    w = Tensor.rand(1,1,3,3)
+    out = x.conv2d(w, padding=(1,1))
+    np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+    Tensor.no_grad = False
+
   def test_two_binops_no_rerun(self):
     Tensor.no_grad = True
     x = Tensor.randn(1,12,128,256)
|