tinygrad 0.9.0.tar.gz → 0.9.2.tar.gz
This diff represents the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in that registry.
- {tinygrad-0.9.0 → tinygrad-0.9.2}/PKG-INFO +23 -14
- {tinygrad-0.9.0 → tinygrad-0.9.2}/README.md +18 -12
- {tinygrad-0.9.0 → tinygrad-0.9.2}/setup.py +7 -3
- tinygrad-0.9.2/test/test_arange.py +167 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_const_folding.py +8 -3
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_conv.py +8 -0
- tinygrad-0.9.2/test/test_conv_shapetracker.py +63 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_custom_function.py +6 -5
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_device_speed.py +2 -2
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_dtype.py +94 -13
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_dtype_alu.py +54 -15
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_fuzz_shape_ops.py +3 -2
- tinygrad-0.9.2/test/test_graph.py +235 -0
- tinygrad-0.9.2/test/test_hcq.py +463 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_image_dtype.py +22 -10
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_jit.py +122 -2
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_lazybuffer.py +9 -9
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_lazyop.py +1 -1
- tinygrad-0.9.2/test/test_linearizer.py +2077 -0
- tinygrad-0.9.2/test/test_linearizer_dumb.py +104 -0
- tinygrad-0.9.2/test/test_linearizer_failures.py +467 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_linearizer_overflows.py +3 -3
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_multitensor.py +231 -95
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_nn.py +147 -68
- tinygrad-0.9.2/test/test_ocl.py +20 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_ops.py +380 -105
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_optim.py +2 -1
- tinygrad-0.9.2/test/test_pattern_matcher.py +186 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_pickle.py +36 -4
- tinygrad-0.9.2/test/test_profiler.py +220 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_randomness.py +12 -6
- tinygrad-0.9.2/test/test_renderer_failures.py +43 -0
- tinygrad-0.9.2/test/test_schedule.py +1589 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_search.py +24 -13
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_speed_v_torch.py +5 -14
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_subbuffer.py +2 -3
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_jit.py +62 -1
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_ops.py +37 -29
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_shapetracker.py +47 -1
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor.py +225 -62
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor_data.py +12 -1
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor_variable.py +23 -18
- tinygrad-0.9.2/test/test_transcendental.py +71 -0
- tinygrad-0.9.2/test/test_uop_graph.py +662 -0
- tinygrad-0.9.2/test/test_uops.py +379 -0
- tinygrad-0.9.2/test/test_uops_stats.py +203 -0
- tinygrad-0.9.2/test/test_verify_lazyop.py +76 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_winograd.py +9 -7
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/codegen/kernel.py +313 -192
- tinygrad-0.9.2/tinygrad/codegen/lowerer.py +215 -0
- tinygrad-0.9.2/tinygrad/codegen/transcendental.py +310 -0
- tinygrad-0.9.2/tinygrad/codegen/uopgraph.py +622 -0
- tinygrad-0.9.2/tinygrad/codegen/uops.py +293 -0
- tinygrad-0.9.2/tinygrad/device.py +679 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/dtype.py +25 -11
- tinygrad-0.9.2/tinygrad/engine/__init__.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/engine/graph.py +24 -37
- tinygrad-0.9.2/tinygrad/engine/jit.py +276 -0
- tinygrad-0.9.2/tinygrad/engine/realize.py +268 -0
- tinygrad-0.9.2/tinygrad/engine/schedule.py +413 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/engine/search.py +33 -23
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/function.py +26 -23
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/helpers.py +121 -14
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/lazy.py +55 -56
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/multi.py +51 -42
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/__init__.py +40 -23
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/datasets.py +2 -1
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/state.py +6 -7
- tinygrad-0.9.2/tinygrad/ops.py +170 -0
- tinygrad-0.9.2/tinygrad/renderer/__init__.py +87 -0
- tinygrad-0.9.2/tinygrad/renderer/assembly.py +267 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/renderer/cstyle.py +125 -93
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/renderer/llvmir.py +44 -53
- tinygrad-0.9.2/tinygrad/runtime/__init__.py +0 -0
- tinygrad-0.9.2/tinygrad/runtime/autogen/amd_gpu.py +32858 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/comgr.py +36 -10
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/cuda.py +6 -162
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hsa.py +146 -14
- tinygrad-0.9.2/tinygrad/runtime/autogen/io_uring.py +1486 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/kfd.py +32 -0
- tinygrad-0.9.2/tinygrad/runtime/autogen/libc.py +4260 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/nv_gpu.py +269 -0
- tinygrad-0.9.2/tinygrad/runtime/autogen/nvrtc.py +579 -0
- tinygrad-0.9.2/tinygrad/runtime/graph/__init__.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/clang.py +5 -4
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/cuda.py +9 -12
- tinygrad-0.9.2/tinygrad/runtime/graph/hcq.py +200 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/metal.py +18 -15
- tinygrad-0.9.2/tinygrad/runtime/ops_amd.py +442 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_clang.py +2 -2
- tinygrad-0.9.2/tinygrad/runtime/ops_cuda.py +127 -0
- tinygrad-0.9.2/tinygrad/runtime/ops_disk.py +121 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_gpu.py +6 -4
- tinygrad-0.9.2/tinygrad/runtime/ops_hip.py +70 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_metal.py +43 -33
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_npy.py +1 -1
- tinygrad-0.9.2/tinygrad/runtime/ops_nv.py +545 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_python.py +35 -35
- tinygrad-0.9.2/tinygrad/runtime/support/__init__.py +0 -0
- tinygrad-0.9.2/tinygrad/runtime/support/compiler_cuda.py +78 -0
- tinygrad-0.9.0/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.9.2/tinygrad/runtime/support/compiler_hip.py +35 -12
- tinygrad-0.9.2/tinygrad/runtime/support/elf.py +38 -0
- tinygrad-0.9.2/tinygrad/shape/__init__.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/shapetracker.py +10 -16
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/symbolic.py +5 -11
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/view.py +67 -40
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/tensor.py +601 -215
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/PKG-INFO +23 -14
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/SOURCES.txt +23 -5
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/requires.txt +4 -1
- tinygrad-0.9.0/test/test_arange.py +0 -17
- tinygrad-0.9.0/test/test_conv_shapetracker.py +0 -22
- tinygrad-0.9.0/test/test_linearizer.py +0 -1453
- tinygrad-0.9.0/test/test_linearizer_failures.py +0 -248
- tinygrad-0.9.0/test/test_pattern_matcher.py +0 -93
- tinygrad-0.9.0/test/test_schedule.py +0 -859
- tinygrad-0.9.0/test/test_uop_graph.py +0 -82
- tinygrad-0.9.0/test/test_uops.py +0 -245
- tinygrad-0.9.0/test/test_uops_stats.py +0 -83
- tinygrad-0.9.0/tinygrad/codegen/linearizer.py +0 -460
- tinygrad-0.9.0/tinygrad/codegen/uops.py +0 -415
- tinygrad-0.9.0/tinygrad/device.py +0 -183
- tinygrad-0.9.0/tinygrad/engine/jit.py +0 -195
- tinygrad-0.9.0/tinygrad/engine/realize.py +0 -191
- tinygrad-0.9.0/tinygrad/engine/schedule.py +0 -362
- tinygrad-0.9.0/tinygrad/ops.py +0 -136
- tinygrad-0.9.0/tinygrad/renderer/__init__.py +0 -61
- tinygrad-0.9.0/tinygrad/renderer/assembly.py +0 -276
- tinygrad-0.9.0/tinygrad/runtime/autogen/amd_gpu.py +0 -1900
- tinygrad-0.9.0/tinygrad/runtime/driver/hsa.py +0 -143
- tinygrad-0.9.0/tinygrad/runtime/graph/hcq.py +0 -143
- tinygrad-0.9.0/tinygrad/runtime/graph/hsa.py +0 -171
- tinygrad-0.9.0/tinygrad/runtime/ops_amd.py +0 -564
- tinygrad-0.9.0/tinygrad/runtime/ops_cuda.py +0 -185
- tinygrad-0.9.0/tinygrad/runtime/ops_disk.py +0 -60
- tinygrad-0.9.0/tinygrad/runtime/ops_hsa.py +0 -278
- tinygrad-0.9.0/tinygrad/runtime/ops_nv.py +0 -630
- {tinygrad-0.9.0 → tinygrad-0.9.2}/LICENSE +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/setup.cfg +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_assign.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_copy_speed.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_fusion_op.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_gc.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_kernel_cache.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_masked_st.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_method_cache.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_net_speed.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_sample.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_setitem.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_specific_conv.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_to_numpy.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_zero_copy.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/__init__.py +0 -0
- {tinygrad-0.9.0/tinygrad/engine → tinygrad-0.9.2/tinygrad/codegen}/__init__.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/optim.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hip.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/opencl.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_llvm.py +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/dependency_links.txt +0 -0
- {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/top_level.txt +0 -0
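Not part of the registry output: a minimal sketch of how one could reproduce this file-level comparison locally from the two sdists, assuming `pip` and a system `diff` are available.

```python
# Sketch only: download both sdists from PyPI and diff the extracted source trees.
import os, subprocess, tarfile, tempfile

with tempfile.TemporaryDirectory() as d:
    for v in ("0.9.0", "0.9.2"):
        # --no-binary :all: forces the source distribution (tar.gz) instead of a wheel
        subprocess.run(["python3", "-m", "pip", "download", f"tinygrad=={v}",
                        "--no-deps", "--no-binary", ":all:", "-d", d], check=True)
        with tarfile.open(os.path.join(d, f"tinygrad-{v}.tar.gz")) as tf:
            tf.extractall(d)
    # recursive unified diff of the two extracted trees; output should match the per-file hunks below
    subprocess.run(["diff", "-ru",
                    os.path.join(d, "tinygrad-0.9.0"), os.path.join(d, "tinygrad-0.9.2")])
```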
--- tinygrad-0.9.0/PKG-INFO
+++ tinygrad-0.9.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tinygrad
-Version: 0.9.
+Version: 0.9.2
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
@@ -10,7 +10,6 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numpy
-Requires-Dist: tqdm
 Requires-Dist: pyobjc-framework-Metal; platform_system == "Darwin"
 Requires-Dist: pyobjc-framework-libdispatch; platform_system == "Darwin"
 Provides-Extra: llvm
@@ -35,15 +34,19 @@ Requires-Dist: onnx==1.16.0; extra == "testing"
 Requires-Dist: onnx2torch; extra == "testing"
 Requires-Dist: opencv-python; extra == "testing"
 Requires-Dist: tabulate; extra == "testing"
+Requires-Dist: tqdm; extra == "testing"
 Requires-Dist: safetensors; extra == "testing"
 Requires-Dist: transformers; extra == "testing"
 Requires-Dist: sentencepiece; extra == "testing"
 Requires-Dist: tiktoken; extra == "testing"
+Requires-Dist: blobfile; extra == "testing"
 Requires-Dist: librosa; extra == "testing"
 Requires-Dist: networkx; extra == "testing"
 Requires-Dist: hypothesis; extra == "testing"
 Requires-Dist: nibabel; extra == "testing"
+Requires-Dist: bottle; extra == "testing"
 Provides-Extra: docs
+Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
 Requires-Dist: mkdocstrings[python]; extra == "docs"
 Requires-Dist: markdown-callouts; extra == "docs"
@@ -64,7 +67,7 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

 <h3>

-[Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](
+[Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](https://docs.tinygrad.org/) | [Discord](https://discord.gg/ZjZadyC7PK)

 </h3>

@@ -106,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.

-```
+```python
 from tinygrad import Tensor, nn

 class LinearNet:
@@ -121,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

-
-
-
-
-
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```

 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -139,7 +143,8 @@ tinygrad already supports numerous accelerators, including:
 - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
-- [x] [
+- [x] [AMD](tinygrad/runtime/ops_amd.py)
+- [x] [NV](tinygrad/runtime/ops_nv.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -163,11 +168,11 @@ python3 -m pip install git+https://github.com/tinygrad/tinygrad.git

 ## Documentation

-Documentation along with a quick start guide can be found
+Documentation along with a quick start guide can be found on the [docs website](https://docs.tinygrad.org/) built from the [docs/](/docs) directory.

 ### Quick example comparing to PyTorch

-```
+```python
 from tinygrad import Tensor

 x = Tensor.eye(3, requires_grad=True)
@@ -180,7 +185,7 @@ print(y.grad.numpy()) # dz/dy
 ```

 The same thing but in PyTorch:
-```
+```python
 import torch

 x = torch.eye(3, requires_grad=True)
@@ -209,7 +214,7 @@ Now, what we want:
 - Bug fixes (with a regression test) are great! This library isn't 1.0 yet, so if you stumble upon a bug, fix it, write a test, and submit a PR, this is valuable work.
 - Solving bounties! tinygrad [offers cash bounties](https://docs.google.com/spreadsheets/d/1WKHbT-7KOgjEawq5h5Ic1qUWzpfAzuD_J06N1JwOCGs/edit?usp=sharing) for certain improvements to the library. All new code should be high quality and well tested.
 - Features. However, if you are adding a feature, consider the line tradeoff. If it's 3 lines, there's less of a bar of usefulness it has to meet over something that's 30 or 300 lines. All features must have regression tests. In general with no other constraints, your feature's API should match torch or numpy.
-- Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win.
+- Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win. Refactors should pass [process replay](#process-replay-tests).
 - Tests/fuzzers. If you can add tests that are non brittle, they are welcome. We have some fuzzers in here too, and there's a plethora of bugs that can be found with them and by improving them. Finding bugs, even writing broken tests (that should pass) with `@unittest.expectedFailure` is great. This is how we make progress.
 - Dead code removal from core `tinygrad/` folder. We don't care about the code in extra, but removing dead code from the core library is great. Less for new people to read and be confused by.

@@ -225,3 +230,7 @@ python3 -m pip install -e '.[testing]' # install extra deps for testing
 python3 test/test_ops.py # just the ops tests
 python3 -m pytest test/ # whole test suite
 ```
+
+#### Process replay tests
+
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
--- tinygrad-0.9.0/README.md
+++ tinygrad-0.9.2/README.md
@@ -9,7 +9,7 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

 <h3>

-[Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](
+[Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](https://docs.tinygrad.org/) | [Discord](https://discord.gg/ZjZadyC7PK)

 </h3>

@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.

-```
+```python
 from tinygrad import Tensor, nn

 class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

-
-
-
-
-
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```

 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -84,7 +85,8 @@ tinygrad already supports numerous accelerators, including:
 - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
-- [x] [
+- [x] [AMD](tinygrad/runtime/ops_amd.py)
+- [x] [NV](tinygrad/runtime/ops_nv.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -108,11 +110,11 @@ python3 -m pip install git+https://github.com/tinygrad/tinygrad.git

 ## Documentation

-Documentation along with a quick start guide can be found
+Documentation along with a quick start guide can be found on the [docs website](https://docs.tinygrad.org/) built from the [docs/](/docs) directory.

 ### Quick example comparing to PyTorch

-```
+```python
 from tinygrad import Tensor

 x = Tensor.eye(3, requires_grad=True)
@@ -125,7 +127,7 @@ print(y.grad.numpy()) # dz/dy
 ```

 The same thing but in PyTorch:
-```
+```python
 import torch

 x = torch.eye(3, requires_grad=True)
@@ -154,7 +156,7 @@ Now, what we want:
 - Bug fixes (with a regression test) are great! This library isn't 1.0 yet, so if you stumble upon a bug, fix it, write a test, and submit a PR, this is valuable work.
 - Solving bounties! tinygrad [offers cash bounties](https://docs.google.com/spreadsheets/d/1WKHbT-7KOgjEawq5h5Ic1qUWzpfAzuD_J06N1JwOCGs/edit?usp=sharing) for certain improvements to the library. All new code should be high quality and well tested.
 - Features. However, if you are adding a feature, consider the line tradeoff. If it's 3 lines, there's less of a bar of usefulness it has to meet over something that's 30 or 300 lines. All features must have regression tests. In general with no other constraints, your feature's API should match torch or numpy.
-- Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win.
+- Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win. Refactors should pass [process replay](#process-replay-tests).
 - Tests/fuzzers. If you can add tests that are non brittle, they are welcome. We have some fuzzers in here too, and there's a plethora of bugs that can be found with them and by improving them. Finding bugs, even writing broken tests (that should pass) with `@unittest.expectedFailure` is great. This is how we make progress.
 - Dead code removal from core `tinygrad/` folder. We don't care about the code in extra, but removing dead code from the core library is great. Less for new people to read and be confused by.

@@ -170,3 +172,7 @@ python3 -m pip install -e '.[testing]' # install extra deps for testing
 python3 test/test_ops.py # just the ops tests
 python3 -m pytest test/ # whole test suite
 ```
+
+#### Process replay tests
+
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
--- tinygrad-0.9.0/setup.py
+++ tinygrad-0.9.2/setup.py
@@ -8,19 +8,19 @@ with open(directory / 'README.md', encoding='utf-8') as f:
   long_description = f.read()

 setup(name='tinygrad',
-      version='0.9.
+      version='0.9.2',
       description='You like pytorch? You like micrograd? You love tinygrad! <3',
       author='George Hotz',
       license='MIT',
       long_description=long_description,
       long_description_content_type='text/markdown',
       packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                  'tinygrad.runtime', 'tinygrad.runtime.
+                  'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
       classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License"
       ],
-      install_requires=["numpy",
+      install_requires=["numpy",
                         "pyobjc-framework-Metal; platform_system=='Darwin'",
                         "pyobjc-framework-libdispatch; platform_system=='Darwin'"],
       python_requires='>=3.8',
@@ -46,16 +46,20 @@ setup(name='tinygrad',
         "onnx2torch",
         "opencv-python",
         "tabulate",
+        "tqdm",
         "safetensors",
         "transformers",
         "sentencepiece",
         "tiktoken",
+        "blobfile",
         "librosa",
         "networkx",
         "hypothesis",
         "nibabel",
+        "bottle",
       ],
       'docs': [
+        "mkdocs",
         "mkdocs-material",
         "mkdocstrings[python]",
         "markdown-callouts",
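The net effect of the metadata and setup.py changes above is that `tqdm` is no longer a hard runtime dependency in 0.9.2 (it moved to the `testing` extra, alongside new `blobfile` and `bottle` entries, with `mkdocs` added to `docs`). A hedged way to confirm this against an installed copy:

```python
# Sketch only: inspect installed tinygrad metadata for its base (non-extra) requirements.
from importlib.metadata import requires, version

print(version("tinygrad"))  # expect 0.9.2
base = [r for r in requires("tinygrad") if "extra ==" not in r]
print(base)  # expect numpy plus the Darwin-only pyobjc requirements; tqdm should be absent
```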
--- /dev/null
+++ tinygrad-0.9.2/test/test_arange.py
@@ -0,0 +1,167 @@
+import unittest, contextlib
+import numpy as np
+from tinygrad import Tensor, GlobalCounters, dtypes, nn
+from tinygrad.helpers import CI, Context, getenv
+from tinygrad.engine.realize import run_schedule
+from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+from tinygrad.engine.realize import CompiledRunner, ExecItem
+from tinygrad.engine.search import get_kernel_actions
+
+class TestArange(unittest.TestCase):
+  def _get_flops(self, N, opts=None):
+    GlobalCounters.reset()
+    tt = Tensor.arange(N)
+    sched = tt.schedule()
+    self.assertEqual(len(sched), 1)
+    k = Kernel(sched[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    p = k.to_program()
+    print(p.name)
+    #print(p.src)
+    ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+    np.testing.assert_equal(tt.numpy(), np.arange(N))
+    return p.op_estimate
+
+  def test_complexity(self, opts=None):
+    # add 1 to avoid divide by 0. arange is 0 flops now!
+    f1 = self._get_flops(256, opts) + 1
+    f2 = self._get_flops(2560, opts) + 1
+    print(f"{f1=}, {f2=}")
+    assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+
+  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)])
+  def test_complexity_w_unroll(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)])
+  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)])
+
+  @unittest.skip("doesn't work yet")
+  def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+  def test_all_opts(self, opts=None, exclude=None):
+    k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+    for opts in all_opts:
+      if exclude is not None and opts[-1] in exclude: continue
+      print(opts)
+      self.test_complexity(opts)
+  def test_all_opts_w_local(self):
+    with contextlib.suppress(KernelOptError):
+      return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+  def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+  def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+  def test_all_opts_w_upcast_and_unroll(self):
+    return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+class TestIndexing(unittest.TestCase):
+  def test_arange_2_reduce(self):
+    needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+    needle[1337] = 1
+    needle.realize()
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      # TODO: it should work without these reshapes
+      out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+      sched = out.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+    assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_manual_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+      idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+      reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+      X = full.sum(axis=(2,3))
+      sched = X.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      # TODO: enable these asserts when the scheduler can handle this
+      #assert len(sched) == 1, f"{len(sched)} != 1"
+      run_schedule(sched)
+      #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index_fused(self, noopt=1):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      assert len(sched) == 2
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(real_index, X.numpy())
+  @unittest.skip("not ready")
+  def test_index_fused_opt(self): self.test_index_fused(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_index_mnist(self, noopt=1):
+    from tinygrad.nn.datasets import mnist
+    X_train, Y_train, _, _ = mnist()
+    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+      GlobalCounters.reset()
+      samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0])
+      x = X_train[samples].numpy()
+      y = Y_train[samples].numpy()
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+    np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+  @unittest.skip("not ready")
+  def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_llama_embedding(self, noopt=1, op_limit=0):
+    # llama3 is 128256
+    vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+    emb = nn.Embedding(vocab_size, embed_size)
+    emb_w = emb.weight.numpy()
+    x = Tensor([1,2,3,4])
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      z = emb(x).realize()
+      self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+      self.assertEqual(GlobalCounters.kernel_count, 2)
+    if getenv("CHECK", 1):
+      import torch
+      with torch.no_grad():
+        torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+        torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+      torch_z = torch_emb(torch.tensor(x.numpy()))
+      # TODO: reshape to match torch, should we do this in nn?
+      np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+  # at least the arange is being fused
+  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1736704000)
+
+if __name__ == "__main__":
+  unittest.main()
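The new test file above exercises the `FUSE_ARANGE` context flag, which lets arange-based gathers (tensor indexing, embeddings) collapse into far fewer kernels. A condensed sketch of the usage pattern the tests rely on, using only names that appear in the file (not an additional upstream test):

```python
# Sketch distilled from test_index_fused above.
from tinygrad import Tensor, GlobalCounters
from tinygrad.helpers import Context
from tinygrad.engine.realize import run_schedule

dataset = Tensor.rand(16384, 256).realize()
idxs = Tensor([0, 3, 5, 6]).realize()
with Context(NOOPT=1, FUSE_ARANGE=1):
    GlobalCounters.reset()
    X = dataset[idxs]        # gather lowers to an arange + compare + reduce
    sched = X.schedule()     # test_index_fused asserts this is exactly 2 kernels
    run_schedule(sched)
print(len(sched), GlobalCounters.global_ops)  # the test keeps ops well under 4*16384
```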
--- tinygrad-0.9.0/test/test_const_folding.py
+++ tinygrad-0.9.2/test/test_const_folding.py
@@ -2,14 +2,14 @@ import unittest, math
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.helpers import CI
-from tinygrad.ops import
+from tinygrad.ops import MetaOps
 import numpy as np
 from test.helpers import is_dtype_supported

 def _check_ast_count(desired_count:int, t:Tensor):
   # NOTE: this has side effect because everything can be scheduled only once
   schedule = create_schedule(t.lazydata.lbs)
-  asts = [s for s in schedule if s.ast
+  asts = [s for s in schedule if s.ast.op is MetaOps.KERNEL]
   assert len(asts) == desired_count

 class TestUnaryOpsConstFolding(unittest.TestCase):
@@ -28,6 +28,11 @@ class TestUnaryOpsConstFolding(unittest.TestCase):
     _check_ast_count(0, Tensor([1, 2, 3]).neg().mul(-1))
     _check_ast_count(0, Tensor([1, 2, 3]).neg().neg())

+  def test_neg_realized_no_fold(self):
+    x = Tensor.randn(32, 32)
+    x = x.clip(0, 1).realize()
+    _check_ast_count(1, x.neg())
+
 class TestBinaryOpsConstFolding(unittest.TestCase):
   def test_add_literal_zero(self):
     _check_ast_count(0, Tensor([1.0, 2, 3, 4]) + 0)
@@ -250,4 +255,4 @@ class TestTautologicalCompare(unittest.TestCase):
     np.testing.assert_equal((a != a).numpy(), [True, False, False])

 if __name__ == '__main__':
-  unittest.main()
+  unittest.main()
--- tinygrad-0.9.0/test/test_conv.py
+++ tinygrad-0.9.2/test/test_conv.py
@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):

     print(ret.numpy())

+  def test_two_binops_no_rerun_small(self):
+    Tensor.no_grad = True
+    x = Tensor.rand(1,1,32,32)
+    w = Tensor.rand(1,1,3,3)
+    out = x.conv2d(w, padding=(1,1))
+    np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+    Tensor.no_grad = False
+
   def test_two_binops_no_rerun(self):
     Tensor.no_grad = True
     x = Tensor.randn(1,12,128,256)
--- /dev/null
+++ tinygrad-0.9.2/test/test_conv_shapetracker.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+import unittest
+from tinygrad.tensor import Tensor
+from tinygrad.ops import MetaOps, BufferOps
+from tinygrad.nn import Conv2d
+from tinygrad.engine.schedule import create_schedule
+from tinygrad.shape.shapetracker import ShapeTracker, View
+from tinygrad.helpers import prod
+from test.unit.test_shapetracker import shapetracker_getitem
+
+class TestConvShapetracker(unittest.TestCase):
+  def test_conv_3x3_one_view(self):
+    conv = Conv2d(16, 32, (3, 3))
+    seen = set()
+
+    # first run to init the weights, they are saved in seen
+    create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen)
+    # run it again to get the kernels
+    sched = [si for si in create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen) if si.ast.op is MetaOps.KERNEL]
+    assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
+    for st in [x.arg.st for x in sched[0].ast.lazyops if x.op is BufferOps.LOAD]:
+      assert len(st.views) == 1
+
+  @unittest.expectedFailure
+  def test_conv_2x2_backward_one_view(self):
+    X = Tensor.rand(1, 1, 3, 3, requires_grad=True)
+    conv = Conv2d(1, 1, (2, 2), bias=False)
+    conv(X).mean().backward()
+    si = X.grad.schedule()[-1]
+    print(si)
+    ldb = [x for x in si.ast.lazyops if x.op is BufferOps.LOAD][0]
+    st: ShapeTracker = ldb.arg.st.simplify()
+    # NOTE: st.real_size() is broken
+    print(si.inputs[0].size)
+    #self.assertEqual(si.inputs[0].size, st.real_size())
+    for v in st.views: print(v)
+
+    # same st
+    test_st = ShapeTracker((
+      View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+      View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+           mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False)))
+    #test_st = ShapeTracker((
+    #  View(shape=(2,4), strides=(1,4), offset=0, mask=None, contiguous=False),
+    #)).simplify()
+    #View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+    #View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+    #  mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False))).simplify()
+    print("*** new ***")
+    for v in test_st.views: print(v)
+    for i in range(prod(st.shape)):
+      i1, i2 = shapetracker_getitem(st, i), shapetracker_getitem(test_st, i)
+      print(i, i1, i2, si.inputs[0].size, i1==i2)
+      #self.assertEqual(i1, i2)
+
+    for stt in [st, test_st]:
+      s,va = stt.expr_idxs()
+      print(s)
+      print(va)
+    assert len(st.views) <= 2
+
+if __name__ == '__main__':
+  unittest.main()
--- tinygrad-0.9.0/test/test_custom_function.py
+++ tinygrad-0.9.2/test/test_custom_function.py
@@ -31,7 +31,7 @@ def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(
 # NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
 # In general, it is also optional to write a backward function, just your backward pass won't work without it

-from tinygrad.ops import
+from tinygrad.ops import MetaOps, BinaryOps, UnaryOps
 from tinygrad.lazy import LazyBuffer
 from tinygrad.tensor import Function

@@ -39,12 +39,13 @@ class ATan2(Function):
   def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
     assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
     self.a, self.b = a, b
-    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype),
+    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), MetaOps.CUSTOM,
                              arg={"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device], srcs=(a.contiguous(), b.contiguous()))
   def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
-
-    return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.
-    grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.
+    recip = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b)).e(UnaryOps.RECIP)
+    return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.MUL, recip)) if self.needs_input_grad[0] else None, \
+           grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.ADD, self.a.e(UnaryOps.NEG)).e(BinaryOps.MUL, recip)) \
+           if self.needs_input_grad[1] else None

 # *** third, we use our lovely new mlop in some tests ***

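For context on the rewritten backward above (standard calculus, not something stated in the diff): the partial derivatives of atan2 are $\partial_a\,\operatorname{atan2}(a,b) = b/(a^2+b^2)$ and $\partial_b\,\operatorname{atan2}(a,b) = -a/(a^2+b^2)$, so both gradients share the factor $1/(a^2+b^2)$; that is what the added `recip` term computes once via `UnaryOps.RECIP` before being multiplied into each branch.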
--- tinygrad-0.9.0/test/test_device_speed.py
+++ tinygrad-0.9.2/test/test_device_speed.py
@@ -1,13 +1,13 @@
 import unittest
 from tinygrad import Device
-from tinygrad.codegen.
+from tinygrad.codegen.uopgraph import UOpGraph
 from tinygrad.helpers import Timing, Profiling

 class TestDeviceSpeed(unittest.TestCase):
   @classmethod
   def setUpClass(cls):
     cls.dev = Device[Device.DEFAULT]
-    cls.empty = Device[Device.DEFAULT].renderer.render("test", UOpGraph())
+    cls.empty = Device[Device.DEFAULT].renderer.render("test", UOpGraph([]))

   def test_empty_compile(self):
     with Timing("compiler "):