tinygrad-0.9.0.tar.gz → tinygrad-0.9.2.tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in the public registry (PyPI). It is provided for informational purposes only.
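
For readers who want to reproduce the comparison locally, here is a minimal sketch (not part of the original diff; it assumes network access and uses PyPI's public JSON API) that fetches both sdists and lists the file-level additions and removals. Running `diff -ru` on the two extracted trees reproduces the per-file changes shown below.

```python
# Sketch only: download the two tinygrad sdists from PyPI and compare their member lists.
import io, json, tarfile, urllib.request

def sdist(version: str) -> tarfile.TarFile:
    meta = json.load(urllib.request.urlopen(f"https://pypi.org/pypi/tinygrad/{version}/json"))
    url = next(f["url"] for f in meta["urls"] if f["packagetype"] == "sdist")
    return tarfile.open(fileobj=io.BytesIO(urllib.request.urlopen(url).read()))

old, new = sdist("0.9.0"), sdist("0.9.2")
old_names = {n.split("/", 1)[-1] for n in old.getnames()}  # strip the tinygrad-x.y.z/ prefix
new_names = {n.split("/", 1)[-1] for n in new.getnames()}
print("added:", sorted(new_names - old_names))
print("removed:", sorted(old_names - new_names))
```
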
Files changed (160)
  1. {tinygrad-0.9.0 → tinygrad-0.9.2}/PKG-INFO +23 -14
  2. {tinygrad-0.9.0 → tinygrad-0.9.2}/README.md +18 -12
  3. {tinygrad-0.9.0 → tinygrad-0.9.2}/setup.py +7 -3
  4. tinygrad-0.9.2/test/test_arange.py +167 -0
  5. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_const_folding.py +8 -3
  6. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_conv.py +8 -0
  7. tinygrad-0.9.2/test/test_conv_shapetracker.py +63 -0
  8. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_custom_function.py +6 -5
  9. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_device_speed.py +2 -2
  10. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_dtype.py +94 -13
  11. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_dtype_alu.py +54 -15
  12. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_fuzz_shape_ops.py +3 -2
  13. tinygrad-0.9.2/test/test_graph.py +235 -0
  14. tinygrad-0.9.2/test/test_hcq.py +463 -0
  15. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_image_dtype.py +22 -10
  16. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_jit.py +122 -2
  17. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_lazybuffer.py +9 -9
  18. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_lazyop.py +1 -1
  19. tinygrad-0.9.2/test/test_linearizer.py +2077 -0
  20. tinygrad-0.9.2/test/test_linearizer_dumb.py +104 -0
  21. tinygrad-0.9.2/test/test_linearizer_failures.py +467 -0
  22. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_linearizer_overflows.py +3 -3
  23. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_multitensor.py +231 -95
  24. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_nn.py +147 -68
  25. tinygrad-0.9.2/test/test_ocl.py +20 -0
  26. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_ops.py +380 -105
  27. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_optim.py +2 -1
  28. tinygrad-0.9.2/test/test_pattern_matcher.py +186 -0
  29. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_pickle.py +36 -4
  30. tinygrad-0.9.2/test/test_profiler.py +220 -0
  31. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_randomness.py +12 -6
  32. tinygrad-0.9.2/test/test_renderer_failures.py +43 -0
  33. tinygrad-0.9.2/test/test_schedule.py +1589 -0
  34. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_search.py +24 -13
  35. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_speed_v_torch.py +5 -14
  36. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_subbuffer.py +2 -3
  37. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_jit.py +62 -1
  38. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_ops.py +37 -29
  39. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_shapetracker.py +47 -1
  40. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor.py +225 -62
  41. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor_data.py +12 -1
  42. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor_variable.py +23 -18
  43. tinygrad-0.9.2/test/test_transcendental.py +71 -0
  44. tinygrad-0.9.2/test/test_uop_graph.py +662 -0
  45. tinygrad-0.9.2/test/test_uops.py +379 -0
  46. tinygrad-0.9.2/test/test_uops_stats.py +203 -0
  47. tinygrad-0.9.2/test/test_verify_lazyop.py +76 -0
  48. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_winograd.py +9 -7
  49. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/codegen/kernel.py +313 -192
  50. tinygrad-0.9.2/tinygrad/codegen/lowerer.py +215 -0
  51. tinygrad-0.9.2/tinygrad/codegen/transcendental.py +310 -0
  52. tinygrad-0.9.2/tinygrad/codegen/uopgraph.py +622 -0
  53. tinygrad-0.9.2/tinygrad/codegen/uops.py +293 -0
  54. tinygrad-0.9.2/tinygrad/device.py +679 -0
  55. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/dtype.py +25 -11
  56. tinygrad-0.9.2/tinygrad/engine/__init__.py +0 -0
  57. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/engine/graph.py +24 -37
  58. tinygrad-0.9.2/tinygrad/engine/jit.py +276 -0
  59. tinygrad-0.9.2/tinygrad/engine/realize.py +268 -0
  60. tinygrad-0.9.2/tinygrad/engine/schedule.py +413 -0
  61. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/engine/search.py +33 -23
  62. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/function.py +26 -23
  63. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/helpers.py +121 -14
  64. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/lazy.py +55 -56
  65. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/multi.py +51 -42
  66. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/__init__.py +40 -23
  67. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/datasets.py +2 -1
  68. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/state.py +6 -7
  69. tinygrad-0.9.2/tinygrad/ops.py +170 -0
  70. tinygrad-0.9.2/tinygrad/renderer/__init__.py +87 -0
  71. tinygrad-0.9.2/tinygrad/renderer/assembly.py +267 -0
  72. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/renderer/cstyle.py +125 -93
  73. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/renderer/llvmir.py +44 -53
  74. tinygrad-0.9.2/tinygrad/runtime/__init__.py +0 -0
  75. tinygrad-0.9.2/tinygrad/runtime/autogen/amd_gpu.py +32858 -0
  76. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/comgr.py +36 -10
  77. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/cuda.py +6 -162
  78. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hsa.py +146 -14
  79. tinygrad-0.9.2/tinygrad/runtime/autogen/io_uring.py +1486 -0
  80. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/kfd.py +32 -0
  81. tinygrad-0.9.2/tinygrad/runtime/autogen/libc.py +4260 -0
  82. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/nv_gpu.py +269 -0
  83. tinygrad-0.9.2/tinygrad/runtime/autogen/nvrtc.py +579 -0
  84. tinygrad-0.9.2/tinygrad/runtime/graph/__init__.py +0 -0
  85. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/clang.py +5 -4
  86. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/cuda.py +9 -12
  87. tinygrad-0.9.2/tinygrad/runtime/graph/hcq.py +200 -0
  88. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/metal.py +18 -15
  89. tinygrad-0.9.2/tinygrad/runtime/ops_amd.py +442 -0
  90. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_clang.py +2 -2
  91. tinygrad-0.9.2/tinygrad/runtime/ops_cuda.py +127 -0
  92. tinygrad-0.9.2/tinygrad/runtime/ops_disk.py +121 -0
  93. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_gpu.py +6 -4
  94. tinygrad-0.9.2/tinygrad/runtime/ops_hip.py +70 -0
  95. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_metal.py +43 -33
  96. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_npy.py +1 -1
  97. tinygrad-0.9.2/tinygrad/runtime/ops_nv.py +545 -0
  98. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_python.py +35 -35
  99. tinygrad-0.9.2/tinygrad/runtime/support/__init__.py +0 -0
  100. tinygrad-0.9.2/tinygrad/runtime/support/compiler_cuda.py +78 -0
  101. tinygrad-0.9.0/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.9.2/tinygrad/runtime/support/compiler_hip.py +35 -12
  102. tinygrad-0.9.2/tinygrad/runtime/support/elf.py +38 -0
  103. tinygrad-0.9.2/tinygrad/shape/__init__.py +0 -0
  104. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/shapetracker.py +10 -16
  105. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/symbolic.py +5 -11
  106. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/view.py +67 -40
  107. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/tensor.py +601 -215
  108. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/PKG-INFO +23 -14
  109. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/SOURCES.txt +23 -5
  110. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/requires.txt +4 -1
  111. tinygrad-0.9.0/test/test_arange.py +0 -17
  112. tinygrad-0.9.0/test/test_conv_shapetracker.py +0 -22
  113. tinygrad-0.9.0/test/test_linearizer.py +0 -1453
  114. tinygrad-0.9.0/test/test_linearizer_failures.py +0 -248
  115. tinygrad-0.9.0/test/test_pattern_matcher.py +0 -93
  116. tinygrad-0.9.0/test/test_schedule.py +0 -859
  117. tinygrad-0.9.0/test/test_uop_graph.py +0 -82
  118. tinygrad-0.9.0/test/test_uops.py +0 -245
  119. tinygrad-0.9.0/test/test_uops_stats.py +0 -83
  120. tinygrad-0.9.0/tinygrad/codegen/linearizer.py +0 -460
  121. tinygrad-0.9.0/tinygrad/codegen/uops.py +0 -415
  122. tinygrad-0.9.0/tinygrad/device.py +0 -183
  123. tinygrad-0.9.0/tinygrad/engine/jit.py +0 -195
  124. tinygrad-0.9.0/tinygrad/engine/realize.py +0 -191
  125. tinygrad-0.9.0/tinygrad/engine/schedule.py +0 -362
  126. tinygrad-0.9.0/tinygrad/ops.py +0 -136
  127. tinygrad-0.9.0/tinygrad/renderer/__init__.py +0 -61
  128. tinygrad-0.9.0/tinygrad/renderer/assembly.py +0 -276
  129. tinygrad-0.9.0/tinygrad/runtime/autogen/amd_gpu.py +0 -1900
  130. tinygrad-0.9.0/tinygrad/runtime/driver/hsa.py +0 -143
  131. tinygrad-0.9.0/tinygrad/runtime/graph/hcq.py +0 -143
  132. tinygrad-0.9.0/tinygrad/runtime/graph/hsa.py +0 -171
  133. tinygrad-0.9.0/tinygrad/runtime/ops_amd.py +0 -564
  134. tinygrad-0.9.0/tinygrad/runtime/ops_cuda.py +0 -185
  135. tinygrad-0.9.0/tinygrad/runtime/ops_disk.py +0 -60
  136. tinygrad-0.9.0/tinygrad/runtime/ops_hsa.py +0 -278
  137. tinygrad-0.9.0/tinygrad/runtime/ops_nv.py +0 -630
  138. {tinygrad-0.9.0 → tinygrad-0.9.2}/LICENSE +0 -0
  139. {tinygrad-0.9.0 → tinygrad-0.9.2}/setup.cfg +0 -0
  140. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_assign.py +0 -0
  141. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_copy_speed.py +0 -0
  142. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_fusion_op.py +0 -0
  143. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_gc.py +0 -0
  144. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_kernel_cache.py +0 -0
  145. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_masked_st.py +0 -0
  146. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_method_cache.py +0 -0
  147. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_net_speed.py +0 -0
  148. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_sample.py +0 -0
  149. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_setitem.py +0 -0
  150. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_specific_conv.py +0 -0
  151. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_to_numpy.py +0 -0
  152. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_zero_copy.py +0 -0
  153. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/__init__.py +0 -0
  154. {tinygrad-0.9.0/tinygrad/engine → tinygrad-0.9.2/tinygrad/codegen}/__init__.py +0 -0
  155. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/optim.py +0 -0
  156. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hip.py +0 -0
  157. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/opencl.py +0 -0
  158. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_llvm.py +0 -0
  159. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/dependency_links.txt +0 -0
  160. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/top_level.txt +0 -0

{tinygrad-0.9.0 → tinygrad-0.9.2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tinygrad
- Version: 0.9.0
+ Version: 0.9.2
  Summary: You like pytorch? You like micrograd? You love tinygrad! <3
  Author: George Hotz
  License: MIT
@@ -10,7 +10,6 @@ Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: numpy
- Requires-Dist: tqdm
  Requires-Dist: pyobjc-framework-Metal; platform_system == "Darwin"
  Requires-Dist: pyobjc-framework-libdispatch; platform_system == "Darwin"
  Provides-Extra: llvm
@@ -35,15 +34,19 @@ Requires-Dist: onnx==1.16.0; extra == "testing"
  Requires-Dist: onnx2torch; extra == "testing"
  Requires-Dist: opencv-python; extra == "testing"
  Requires-Dist: tabulate; extra == "testing"
+ Requires-Dist: tqdm; extra == "testing"
  Requires-Dist: safetensors; extra == "testing"
  Requires-Dist: transformers; extra == "testing"
  Requires-Dist: sentencepiece; extra == "testing"
  Requires-Dist: tiktoken; extra == "testing"
+ Requires-Dist: blobfile; extra == "testing"
  Requires-Dist: librosa; extra == "testing"
  Requires-Dist: networkx; extra == "testing"
  Requires-Dist: hypothesis; extra == "testing"
  Requires-Dist: nibabel; extra == "testing"
+ Requires-Dist: bottle; extra == "testing"
  Provides-Extra: docs
+ Requires-Dist: mkdocs; extra == "docs"
  Requires-Dist: mkdocs-material; extra == "docs"
  Requires-Dist: mkdocstrings[python]; extra == "docs"
  Requires-Dist: markdown-callouts; extra == "docs"
@@ -64,7 +67,7 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

  <h3>

- [Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](/docs) | [Examples](/examples) | [Showcase](/docs/showcase.md) | [Discord](https://discord.gg/ZjZadyC7PK)
+ [Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](https://docs.tinygrad.org/) | [Discord](https://discord.gg/ZjZadyC7PK)

  </h3>

@@ -106,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
  As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
  Throw in an optimizer, a data loader, and some compute, and you have all you need.

- ```py
+ ```python
  from tinygrad import Tensor, nn

  class LinearNet:
@@ -121,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

  x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

- for i in range(10):
-   optim.zero_grad()
-   loss = model(x).sparse_categorical_crossentropy(y).backward()
-   optim.step()
-   print(i, loss.item())
+ with Tensor.train():
+   for i in range(10):
+     optim.zero_grad()
+     loss = model(x).sparse_categorical_crossentropy(y).backward()
+     optim.step()
+     print(i, loss.item())
  ```

  See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -139,7 +143,8 @@ tinygrad already supports numerous accelerators, including:
  - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
  - [x] [METAL](tinygrad/runtime/ops_metal.py)
  - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
- - [x] [HSA](tinygrad/runtime/ops_hsa.py)
+ - [x] [AMD](tinygrad/runtime/ops_amd.py)
+ - [x] [NV](tinygrad/runtime/ops_nv.py)

  And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -163,11 +168,11 @@ python3 -m pip install git+https://github.com/tinygrad/tinygrad.git

  ## Documentation

- Documentation along with a quick start guide can be found in the [docs/](/docs) directory.
+ Documentation along with a quick start guide can be found on the [docs website](https://docs.tinygrad.org/) built from the [docs/](/docs) directory.

  ### Quick example comparing to PyTorch

- ```py
+ ```python
  from tinygrad import Tensor

  x = Tensor.eye(3, requires_grad=True)
@@ -180,7 +185,7 @@ print(y.grad.numpy()) # dz/dy
  ```

  The same thing but in PyTorch:
- ```py
+ ```python
  import torch

  x = torch.eye(3, requires_grad=True)
@@ -209,7 +214,7 @@ Now, what we want:
  - Bug fixes (with a regression test) are great! This library isn't 1.0 yet, so if you stumble upon a bug, fix it, write a test, and submit a PR, this is valuable work.
  - Solving bounties! tinygrad [offers cash bounties](https://docs.google.com/spreadsheets/d/1WKHbT-7KOgjEawq5h5Ic1qUWzpfAzuD_J06N1JwOCGs/edit?usp=sharing) for certain improvements to the library. All new code should be high quality and well tested.
  - Features. However, if you are adding a feature, consider the line tradeoff. If it's 3 lines, there's less of a bar of usefulness it has to meet over something that's 30 or 300 lines. All features must have regression tests. In general with no other constraints, your feature's API should match torch or numpy.
- - Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win.
+ - Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win. Refactors should pass [process replay](#process-replay-tests).
  - Tests/fuzzers. If you can add tests that are non brittle, they are welcome. We have some fuzzers in here too, and there's a plethora of bugs that can be found with them and by improving them. Finding bugs, even writing broken tests (that should pass) with `@unittest.expectedFailure` is great. This is how we make progress.
  - Dead code removal from core `tinygrad/` folder. We don't care about the code in extra, but removing dead code from the core library is great. Less for new people to read and be confused by.

@@ -225,3 +230,7 @@ python3 -m pip install -e '.[testing]' # install extra deps for testing
  python3 test/test_ops.py # just the ops tests
  python3 -m pytest test/ # whole test suite
  ```
+
+ #### Process replay tests
+
+ [Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.

{tinygrad-0.9.0 → tinygrad-0.9.2}/README.md
@@ -9,7 +9,7 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

  <h3>

- [Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](/docs) | [Examples](/examples) | [Showcase](/docs/showcase.md) | [Discord](https://discord.gg/ZjZadyC7PK)
+ [Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](https://docs.tinygrad.org/) | [Discord](https://discord.gg/ZjZadyC7PK)

  </h3>

@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
  As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
  Throw in an optimizer, a data loader, and some compute, and you have all you need.

- ```py
+ ```python
  from tinygrad import Tensor, nn

  class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

  x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

- for i in range(10):
-   optim.zero_grad()
-   loss = model(x).sparse_categorical_crossentropy(y).backward()
-   optim.step()
-   print(i, loss.item())
+ with Tensor.train():
+   for i in range(10):
+     optim.zero_grad()
+     loss = model(x).sparse_categorical_crossentropy(y).backward()
+     optim.step()
+     print(i, loss.item())
  ```

  See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -84,7 +85,8 @@ tinygrad already supports numerous accelerators, including:
  - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
  - [x] [METAL](tinygrad/runtime/ops_metal.py)
  - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
- - [x] [HSA](tinygrad/runtime/ops_hsa.py)
+ - [x] [AMD](tinygrad/runtime/ops_amd.py)
+ - [x] [NV](tinygrad/runtime/ops_nv.py)

  And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -108,11 +110,11 @@ python3 -m pip install git+https://github.com/tinygrad/tinygrad.git

  ## Documentation

- Documentation along with a quick start guide can be found in the [docs/](/docs) directory.
+ Documentation along with a quick start guide can be found on the [docs website](https://docs.tinygrad.org/) built from the [docs/](/docs) directory.

  ### Quick example comparing to PyTorch

- ```py
+ ```python
  from tinygrad import Tensor

  x = Tensor.eye(3, requires_grad=True)
@@ -125,7 +127,7 @@ print(y.grad.numpy()) # dz/dy
  ```

  The same thing but in PyTorch:
- ```py
+ ```python
  import torch

  x = torch.eye(3, requires_grad=True)
@@ -154,7 +156,7 @@ Now, what we want:
  - Bug fixes (with a regression test) are great! This library isn't 1.0 yet, so if you stumble upon a bug, fix it, write a test, and submit a PR, this is valuable work.
  - Solving bounties! tinygrad [offers cash bounties](https://docs.google.com/spreadsheets/d/1WKHbT-7KOgjEawq5h5Ic1qUWzpfAzuD_J06N1JwOCGs/edit?usp=sharing) for certain improvements to the library. All new code should be high quality and well tested.
  - Features. However, if you are adding a feature, consider the line tradeoff. If it's 3 lines, there's less of a bar of usefulness it has to meet over something that's 30 or 300 lines. All features must have regression tests. In general with no other constraints, your feature's API should match torch or numpy.
- - Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win.
+ - Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win. Refactors should pass [process replay](#process-replay-tests).
  - Tests/fuzzers. If you can add tests that are non brittle, they are welcome. We have some fuzzers in here too, and there's a plethora of bugs that can be found with them and by improving them. Finding bugs, even writing broken tests (that should pass) with `@unittest.expectedFailure` is great. This is how we make progress.
  - Dead code removal from core `tinygrad/` folder. We don't care about the code in extra, but removing dead code from the core library is great. Less for new people to read and be confused by.

@@ -170,3 +172,7 @@ python3 -m pip install -e '.[testing]' # install extra deps for testing
  python3 test/test_ops.py # just the ops tests
  python3 -m pytest test/ # whole test suite
  ```
+
+ #### Process replay tests
+
+ [Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
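
The training-loop hunk above only shows the changed lines. For context, a self-contained sketch of what the updated 0.9.2 README snippet amounts to is given below; the `LinearNet` layer definitions are illustrative assumptions reconstructed around the context lines visible in the hunk (the `model.l1`/`model.l2` parameters, the Adam optimizer, and the `(4, 1, 28, 28)` input), not a verbatim copy of the README.

```python
# Sketch of the updated README loop: the body is unchanged, it is now wrapped in
# `with Tensor.train():` so training-mode behavior (e.g. dropout) is enabled.
from tinygrad import Tensor, nn

class LinearNet:
  def __init__(self):
    self.l1 = Tensor.kaiming_uniform(784, 128)  # layer shapes are assumptions for illustration
    self.l2 = Tensor.kaiming_uniform(128, 10)
  def __call__(self, x: Tensor) -> Tensor:
    return x.flatten(1).dot(self.l1).relu().dot(self.l2)

model = LinearNet()
optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)
x, y = Tensor.rand(4, 1, 28, 28), Tensor([2, 4, 3, 7])  # replace with a real MNIST dataloader

with Tensor.train():
  for i in range(10):
    optim.zero_grad()
    loss = model(x).sparse_categorical_crossentropy(y).backward()
    optim.step()
    print(i, loss.item())
```
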

{tinygrad-0.9.0 → tinygrad-0.9.2}/setup.py
@@ -8,19 +8,19 @@ with open(directory / 'README.md', encoding='utf-8') as f:
    long_description = f.read()

  setup(name='tinygrad',
-       version='0.9.0',
+       version='0.9.2',
        description='You like pytorch? You like micrograd? You love tinygrad! <3',
        author='George Hotz',
        license='MIT',
        long_description=long_description,
        long_description_content_type='text/markdown',
        packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                   'tinygrad.runtime', 'tinygrad.runtime.driver', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+                   'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
        classifiers=[
          "Programming Language :: Python :: 3",
          "License :: OSI Approved :: MIT License"
        ],
-       install_requires=["numpy", "tqdm",
+       install_requires=["numpy",
                          "pyobjc-framework-Metal; platform_system=='Darwin'",
                          "pyobjc-framework-libdispatch; platform_system=='Darwin'"],
        python_requires='>=3.8',
@@ -46,16 +46,20 @@ setup(name='tinygrad',
            "onnx2torch",
            "opencv-python",
            "tabulate",
+           "tqdm",
            "safetensors",
            "transformers",
            "sentencepiece",
            "tiktoken",
+           "blobfile",
            "librosa",
            "networkx",
            "hypothesis",
            "nibabel",
+           "bottle",
          ],
          'docs': [
+           "mkdocs",
            "mkdocs-material",
            "mkdocstrings[python]",
            "markdown-callouts",

tinygrad-0.9.2/test/test_arange.py (new file)
@@ -0,0 +1,167 @@
+ import unittest, contextlib
+ import numpy as np
+ from tinygrad import Tensor, GlobalCounters, dtypes, nn
+ from tinygrad.helpers import CI, Context, getenv
+ from tinygrad.engine.realize import run_schedule
+ from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+ from tinygrad.engine.realize import CompiledRunner, ExecItem
+ from tinygrad.engine.search import get_kernel_actions
+
+ class TestArange(unittest.TestCase):
+   def _get_flops(self, N, opts=None):
+     GlobalCounters.reset()
+     tt = Tensor.arange(N)
+     sched = tt.schedule()
+     self.assertEqual(len(sched), 1)
+     k = Kernel(sched[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     p = k.to_program()
+     print(p.name)
+     #print(p.src)
+     ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+     np.testing.assert_equal(tt.numpy(), np.arange(N))
+     return p.op_estimate
+
+   def test_complexity(self, opts=None):
+     # add 1 to avoid divide by 0. arange is 0 flops now!
+     f1 = self._get_flops(256, opts) + 1
+     f2 = self._get_flops(2560, opts) + 1
+     print(f"{f1=}, {f2=}")
+     assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+
+   def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)])
+   def test_complexity_w_unroll(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)])
+   def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)])
+
+   @unittest.skip("doesn't work yet")
+   def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+   def test_all_opts(self, opts=None, exclude=None):
+     k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+     k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+     all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+     for opts in all_opts:
+       if exclude is not None and opts[-1] in exclude: continue
+       print(opts)
+       self.test_complexity(opts)
+   def test_all_opts_w_local(self):
+     with contextlib.suppress(KernelOptError):
+       return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+   def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+   def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+   def test_all_opts_w_upcast_and_unroll(self):
+     return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+ class TestIndexing(unittest.TestCase):
+   def test_arange_2_reduce(self):
+     needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+     needle[1337] = 1
+     needle.realize()
+     with Context(NOOPT=1, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       # TODO: it should work without these reshapes
+       out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+       sched = out.schedule()
+       assert len(sched) == 1
+       run_schedule(sched)
+     assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_manual_index(self):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=1, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+       idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+       reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+       full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+       X = full.sum(axis=(2,3))
+       sched = X.schedule()
+       assert len(sched) == 1
+       run_schedule(sched)
+       assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+     np.testing.assert_allclose(real_index, X.numpy())
+
+   def test_index(self):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=1):
+       GlobalCounters.reset()
+       X = dataset[idxs]
+       assert X.shape == (4,256)
+       sched = X.schedule()
+       # TODO: enable these asserts when the scheduler can handle this
+       #assert len(sched) == 1, f"{len(sched)} != 1"
+       run_schedule(sched)
+       #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+     np.testing.assert_allclose(real_index, X.numpy())
+
+   def test_index_fused(self, noopt=1):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=noopt, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       X = dataset[idxs]
+       assert X.shape == (4,256)
+       sched = X.schedule()
+       assert len(sched) == 2
+       run_schedule(sched)
+       assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+     np.testing.assert_allclose(real_index, X.numpy())
+   @unittest.skip("not ready")
+   def test_index_fused_opt(self): self.test_index_fused(0)
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_index_mnist(self, noopt=1):
+     from tinygrad.nn.datasets import mnist
+     X_train, Y_train, _, _ = mnist()
+     with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+       GlobalCounters.reset()
+       samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0])
+       x = X_train[samples].numpy()
+       y = Y_train[samples].numpy()
+       assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+     np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+     np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+   @unittest.skip("not ready")
+   def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_llama_embedding(self, noopt=1, op_limit=0):
+     # llama3 is 128256
+     vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+     emb = nn.Embedding(vocab_size, embed_size)
+     emb_w = emb.weight.numpy()
+     x = Tensor([1,2,3,4])
+     with Context(NOOPT=noopt, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       z = emb(x).realize()
+       self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+       self.assertEqual(GlobalCounters.kernel_count, 2)
+     if getenv("CHECK", 1):
+       import torch
+       with torch.no_grad():
+         torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+         torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+         torch_z = torch_emb(torch.tensor(x.numpy()))
+       # TODO: reshape to match torch, should we do this in nn?
+       np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+   # at least the arange is being fused
+   def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1736704000)
+
+ if __name__ == "__main__":
+   unittest.main()
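
The new test file above revolves around the FUSE_ARANGE context flag, which lets the scheduler fuse the arange produced by fancy indexing into the consuming reduce kernel. A small usage sketch (assuming tinygrad 0.9.2 is installed; the shapes are arbitrary) of the same counters the tests check:

```python
# Sketch: count kernels/ops for a fancy-indexing lookup with arange fusion enabled.
from tinygrad import Tensor, GlobalCounters
from tinygrad.helpers import Context

dataset = Tensor.rand(1024, 16).realize()
idxs = Tensor([0, 3, 5, 6]).realize()
with Context(FUSE_ARANGE=1):
  GlobalCounters.reset()
  X = dataset[idxs].realize()  # indexing lowers to arange + compare + reduce
print(GlobalCounters.kernel_count, GlobalCounters.global_ops)
print(X.shape)  # (4, 16)
```
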

{tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_const_folding.py
@@ -2,14 +2,14 @@ import unittest, math
  from tinygrad import Tensor, Device, dtypes
  from tinygrad.engine.schedule import create_schedule
  from tinygrad.helpers import CI
- from tinygrad.ops import BufferOps
+ from tinygrad.ops import MetaOps
  import numpy as np
  from test.helpers import is_dtype_supported

  def _check_ast_count(desired_count:int, t:Tensor):
    # NOTE: this has side effect because everything can be scheduled only once
    schedule = create_schedule(t.lazydata.lbs)
-   asts = [s for s in schedule if s.ast[0].op is BufferOps.STORE]
+   asts = [s for s in schedule if s.ast.op is MetaOps.KERNEL]
    assert len(asts) == desired_count

  class TestUnaryOpsConstFolding(unittest.TestCase):
@@ -28,6 +28,11 @@ class TestUnaryOpsConstFolding(unittest.TestCase):
      _check_ast_count(0, Tensor([1, 2, 3]).neg().mul(-1))
      _check_ast_count(0, Tensor([1, 2, 3]).neg().neg())

+   def test_neg_realized_no_fold(self):
+     x = Tensor.randn(32, 32)
+     x = x.clip(0, 1).realize()
+     _check_ast_count(1, x.neg())
+
  class TestBinaryOpsConstFolding(unittest.TestCase):
    def test_add_literal_zero(self):
      _check_ast_count(0, Tensor([1.0, 2, 3, 4]) + 0)
@@ -250,4 +255,4 @@ class TestTautologicalCompare(unittest.TestCase):
      np.testing.assert_equal((a != a).numpy(), [True, False, False])

  if __name__ == '__main__':
-   unittest.main()
+   unittest.main()

{tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_conv.py
@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):

      print(ret.numpy())

+   def test_two_binops_no_rerun_small(self):
+     Tensor.no_grad = True
+     x = Tensor.rand(1,1,32,32)
+     w = Tensor.rand(1,1,3,3)
+     out = x.conv2d(w, padding=(1,1))
+     np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+     Tensor.no_grad = False
+
    def test_two_binops_no_rerun(self):
      Tensor.no_grad = True
      x = Tensor.randn(1,12,128,256)

tinygrad-0.9.2/test/test_conv_shapetracker.py (new file)
@@ -0,0 +1,63 @@
+ #!/usr/bin/env python
+ import unittest
+ from tinygrad.tensor import Tensor
+ from tinygrad.ops import MetaOps, BufferOps
+ from tinygrad.nn import Conv2d
+ from tinygrad.engine.schedule import create_schedule
+ from tinygrad.shape.shapetracker import ShapeTracker, View
+ from tinygrad.helpers import prod
+ from test.unit.test_shapetracker import shapetracker_getitem
+
+ class TestConvShapetracker(unittest.TestCase):
+   def test_conv_3x3_one_view(self):
+     conv = Conv2d(16, 32, (3, 3))
+     seen = set()
+
+     # first run to init the weights, they are saved in seen
+     create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen)
+     # run it again to get the kernels
+     sched = [si for si in create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen) if si.ast.op is MetaOps.KERNEL]
+     assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
+     for st in [x.arg.st for x in sched[0].ast.lazyops if x.op is BufferOps.LOAD]:
+       assert len(st.views) == 1
+
+   @unittest.expectedFailure
+   def test_conv_2x2_backward_one_view(self):
+     X = Tensor.rand(1, 1, 3, 3, requires_grad=True)
+     conv = Conv2d(1, 1, (2, 2), bias=False)
+     conv(X).mean().backward()
+     si = X.grad.schedule()[-1]
+     print(si)
+     ldb = [x for x in si.ast.lazyops if x.op is BufferOps.LOAD][0]
+     st: ShapeTracker = ldb.arg.st.simplify()
+     # NOTE: st.real_size() is broken
+     print(si.inputs[0].size)
+     #self.assertEqual(si.inputs[0].size, st.real_size())
+     for v in st.views: print(v)
+
+     # same st
+     test_st = ShapeTracker((
+       View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+       View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+            mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False)))
+     #test_st = ShapeTracker((
+     #  View(shape=(2,4), strides=(1,4), offset=0, mask=None, contiguous=False),
+     #)).simplify()
+     #View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+     #View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+     #  mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False))).simplify()
+     print("*** new ***")
+     for v in test_st.views: print(v)
+     for i in range(prod(st.shape)):
+       i1, i2 = shapetracker_getitem(st, i), shapetracker_getitem(test_st, i)
+       print(i, i1, i2, si.inputs[0].size, i1==i2)
+       #self.assertEqual(i1, i2)
+
+     for stt in [st, test_st]:
+       s,va = stt.expr_idxs()
+       print(s)
+       print(va)
+     assert len(st.views) <= 2
+
+ if __name__ == '__main__':
+   unittest.main()

{tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_custom_function.py
@@ -31,7 +31,7 @@ def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(
  # NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
  # In general, it is also optional to write a backward function, just your backward pass won't work without it

- from tinygrad.ops import LoadOps, BinaryOps
+ from tinygrad.ops import MetaOps, BinaryOps, UnaryOps
  from tinygrad.lazy import LazyBuffer
  from tinygrad.tensor import Function

@@ -39,12 +39,13 @@ class ATan2(Function):
    def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
      assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
      self.a, self.b = a, b
-     return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), LoadOps.CUSTOM,
+     return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), MetaOps.CUSTOM,
                               arg={"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device], srcs=(a.contiguous(), b.contiguous()))
    def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
-     denom = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b))
-     return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \
-            grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.SUB, self.a).e(BinaryOps.DIV, denom)) if self.needs_input_grad[1] else None
+     recip = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b)).e(UnaryOps.RECIP)
+     return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.MUL, recip)) if self.needs_input_grad[0] else None, \
+            grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.ADD, self.a.e(UnaryOps.NEG)).e(BinaryOps.MUL, recip)) \
+            if self.needs_input_grad[1] else None

  # *** third, we use our lovely new mlop in some tests ***

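
For reference, the gradient identity that the reworked `ATan2.backward` above implements; the change swaps `DIV`/`SUB` for `RECIP`/`NEG`, with `recip` standing for 1/(a²+b²), but the math is unchanged:

$$
\frac{\partial}{\partial a}\,\operatorname{atan2}(a,b) = \frac{b}{a^{2}+b^{2}},
\qquad
\frac{\partial}{\partial b}\,\operatorname{atan2}(a,b) = \frac{-a}{a^{2}+b^{2}}
$$
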

{tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_device_speed.py
@@ -1,13 +1,13 @@
  import unittest
  from tinygrad import Device
- from tinygrad.codegen.uops import UOpGraph
+ from tinygrad.codegen.uopgraph import UOpGraph
  from tinygrad.helpers import Timing, Profiling

  class TestDeviceSpeed(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
      cls.dev = Device[Device.DEFAULT]
-     cls.empty = Device[Device.DEFAULT].renderer.render("test", UOpGraph())
+     cls.empty = Device[Device.DEFAULT].renderer.render("test", UOpGraph([]))

    def test_empty_compile(self):
      with Timing("compiler "):