tinygrad 0.9.1.tar.gz → 0.10.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tinygrad-0.9.1 → tinygrad-0.10.0}/PKG-INFO +21 -17
- {tinygrad-0.9.1 → tinygrad-0.10.0}/README.md +13 -11
- {tinygrad-0.9.1 → tinygrad-0.10.0}/setup.py +13 -9
- tinygrad-0.10.0/test/test_arange.py +179 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_assign.py +17 -4
- tinygrad-0.10.0/test/test_compile_failures.py +18 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_const_folding.py +27 -12
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_conv.py +8 -0
- tinygrad-0.10.0/test/test_conv_shapetracker.py +58 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_copy_speed.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_device_speed.py +1 -2
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_dtype.py +172 -35
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_dtype_alu.py +27 -15
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_fusion_op.py +28 -9
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_fuzz_shape_ops.py +2 -2
- tinygrad-0.10.0/test/test_gc.py +67 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_graph.py +1 -2
- tinygrad-0.10.0/test/test_hcq.py +475 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_image_dtype.py +51 -11
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_jit.py +79 -2
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_lazybuffer.py +34 -13
- tinygrad-0.10.0/test/test_linearizer.py +2174 -0
- tinygrad-0.10.0/test/test_linearizer_dumb.py +223 -0
- tinygrad-0.10.0/test/test_linearizer_failures.py +1435 -0
- tinygrad-0.10.0/test/test_linearizer_overflows.py +196 -0
- tinygrad-0.10.0/test/test_metal.py +77 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_multitensor.py +202 -47
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_nn.py +259 -30
- tinygrad-0.10.0/test/test_ocl.py +31 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_ops.py +615 -52
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_optim.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_pickle.py +52 -6
- tinygrad-0.10.0/test/test_profiler.py +221 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_randomness.py +146 -28
- tinygrad-0.10.0/test/test_rearrange_einops.py +321 -0
- tinygrad-0.10.0/test/test_renderer_failures.py +68 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_sample.py +1 -2
- tinygrad-0.10.0/test/test_schedule.py +1859 -0
- tinygrad-0.10.0/test/test_search.py +158 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_setitem.py +23 -8
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_specific_conv.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_speed_v_torch.py +9 -16
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_subbuffer.py +2 -3
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_jit.py +1 -3
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_ops.py +2 -2
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_shapetracker.py +37 -40
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor.py +182 -11
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor_data.py +12 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor_variable.py +36 -20
- tinygrad-0.10.0/test/test_tiny.py +84 -0
- tinygrad-0.10.0/test/test_transcendental.py +121 -0
- tinygrad-0.10.0/test/test_uop_graph.py +716 -0
- tinygrad-0.10.0/test/test_uops.py +454 -0
- tinygrad-0.10.0/test/test_uops_stats.py +224 -0
- tinygrad-0.10.0/test/test_viz.py +93 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_winograd.py +7 -6
- tinygrad-0.10.0/tinygrad/__init__.py +11 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/codegen/kernel.py +308 -175
- tinygrad-0.10.0/tinygrad/codegen/linearize.py +95 -0
- tinygrad-0.10.0/tinygrad/codegen/lowerer.py +143 -0
- tinygrad-0.10.0/tinygrad/codegen/transcendental.py +257 -0
- tinygrad-0.10.0/tinygrad/codegen/uopgraph.py +506 -0
- tinygrad-0.10.0/tinygrad/device.py +221 -0
- tinygrad-0.10.0/tinygrad/dtype.py +188 -0
- tinygrad-0.10.0/tinygrad/engine/jit.py +295 -0
- {tinygrad-0.9.1/tinygrad → tinygrad-0.10.0/tinygrad/engine}/lazy.py +74 -66
- tinygrad-0.10.0/tinygrad/engine/memory.py +51 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/realize.py +86 -61
- tinygrad-0.10.0/tinygrad/engine/schedule.py +419 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/search.py +58 -47
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/function.py +59 -58
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/helpers.py +120 -102
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/multi.py +82 -78
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/__init__.py +116 -67
- tinygrad-0.10.0/tinygrad/nn/datasets.py +15 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/optim.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/state.py +91 -6
- tinygrad-0.10.0/tinygrad/ops.py +1152 -0
- tinygrad-0.10.0/tinygrad/renderer/__init__.py +89 -0
- tinygrad-0.10.0/tinygrad/renderer/cstyle.py +462 -0
- tinygrad-0.10.0/tinygrad/renderer/llvmir.py +142 -0
- tinygrad-0.10.0/tinygrad/renderer/ptx.py +225 -0
- tinygrad-0.10.0/tinygrad/runtime/autogen/adreno.py +17904 -0
- tinygrad-0.10.0/tinygrad/runtime/autogen/amd_gpu.py +48384 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/cuda.py +6 -162
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/io_uring.py +97 -63
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/kfd.py +60 -47
- tinygrad-0.10.0/tinygrad/runtime/autogen/kgsl.py +1386 -0
- tinygrad-0.10.0/tinygrad/runtime/autogen/libc.py +5462 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
- tinygrad-0.10.0/tinygrad/runtime/autogen/nvrtc.py +579 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/opencl.py +11 -11
- tinygrad-0.10.0/tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/clang.py +3 -3
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/cuda.py +11 -15
- tinygrad-0.10.0/tinygrad/runtime/graph/hcq.py +200 -0
- tinygrad-0.10.0/tinygrad/runtime/graph/metal.py +103 -0
- tinygrad-0.10.0/tinygrad/runtime/ops_amd.py +471 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_clang.py +12 -5
- tinygrad-0.10.0/tinygrad/runtime/ops_cloud.py +220 -0
- tinygrad-0.10.0/tinygrad/runtime/ops_cuda.py +128 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_disk.py +25 -26
- tinygrad-0.10.0/tinygrad/runtime/ops_dsp.py +181 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_gpu.py +29 -16
- tinygrad-0.10.0/tinygrad/runtime/ops_hip.py +68 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_llvm.py +15 -10
- tinygrad-0.10.0/tinygrad/runtime/ops_metal.py +188 -0
- tinygrad-0.10.0/tinygrad/runtime/ops_nv.py +584 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_python.py +78 -79
- tinygrad-0.10.0/tinygrad/runtime/ops_qcom.py +405 -0
- tinygrad-0.10.0/tinygrad/runtime/support/compiler_cuda.py +77 -0
- tinygrad-0.9.1/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.10.0/tinygrad/runtime/support/compiler_hip.py +13 -1
- tinygrad-0.10.0/tinygrad/runtime/support/elf.py +38 -0
- tinygrad-0.10.0/tinygrad/runtime/support/hcq.py +539 -0
- tinygrad-0.10.0/tinygrad/shape/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/shape/shapetracker.py +40 -50
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/shape/view.py +102 -63
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/tensor.py +1109 -365
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/PKG-INFO +21 -17
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/SOURCES.txt +33 -13
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/requires.txt +6 -6
- tinygrad-0.9.1/test/test_arange.py +0 -19
- tinygrad-0.9.1/test/test_conv_shapetracker.py +0 -22
- tinygrad-0.9.1/test/test_custom_function.py +0 -106
- tinygrad-0.9.1/test/test_gc.py +0 -37
- tinygrad-0.9.1/test/test_lazyop.py +0 -34
- tinygrad-0.9.1/test/test_linearizer.py +0 -1778
- tinygrad-0.9.1/test/test_linearizer_failures.py +0 -255
- tinygrad-0.9.1/test/test_linearizer_overflows.py +0 -89
- tinygrad-0.9.1/test/test_pattern_matcher.py +0 -168
- tinygrad-0.9.1/test/test_print_tree.py +0 -66
- tinygrad-0.9.1/test/test_schedule.py +0 -1156
- tinygrad-0.9.1/test/test_search.py +0 -101
- tinygrad-0.9.1/test/test_uop_graph.py +0 -190
- tinygrad-0.9.1/test/test_uops.py +0 -319
- tinygrad-0.9.1/test/test_uops_stats.py +0 -81
- tinygrad-0.9.1/test/test_verify_lazyop.py +0 -64
- tinygrad-0.9.1/tinygrad/__init__.py +0 -6
- tinygrad-0.9.1/tinygrad/codegen/linearizer.py +0 -528
- tinygrad-0.9.1/tinygrad/codegen/uops.py +0 -451
- tinygrad-0.9.1/tinygrad/device.py +0 -320
- tinygrad-0.9.1/tinygrad/dtype.py +0 -113
- tinygrad-0.9.1/tinygrad/engine/graph.py +0 -100
- tinygrad-0.9.1/tinygrad/engine/jit.py +0 -198
- tinygrad-0.9.1/tinygrad/engine/schedule.py +0 -370
- tinygrad-0.9.1/tinygrad/nn/datasets.py +0 -8
- tinygrad-0.9.1/tinygrad/ops.py +0 -169
- tinygrad-0.9.1/tinygrad/renderer/__init__.py +0 -65
- tinygrad-0.9.1/tinygrad/renderer/assembly.py +0 -269
- tinygrad-0.9.1/tinygrad/renderer/cstyle.py +0 -389
- tinygrad-0.9.1/tinygrad/renderer/llvmir.py +0 -160
- tinygrad-0.9.1/tinygrad/runtime/autogen/amd_gpu.py +0 -13403
- tinygrad-0.9.1/tinygrad/runtime/graph/hcq.py +0 -187
- tinygrad-0.9.1/tinygrad/runtime/graph/metal.py +0 -75
- tinygrad-0.9.1/tinygrad/runtime/ops_amd.py +0 -550
- tinygrad-0.9.1/tinygrad/runtime/ops_cuda.py +0 -185
- tinygrad-0.9.1/tinygrad/runtime/ops_metal.py +0 -105
- tinygrad-0.9.1/tinygrad/runtime/ops_nv.py +0 -625
- tinygrad-0.9.1/tinygrad/shape/symbolic.py +0 -327
- {tinygrad-0.9.1 → tinygrad-0.10.0}/LICENSE +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/setup.cfg +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_kernel_cache.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_masked_st.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_method_cache.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_net_speed.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_to_numpy.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_zero_copy.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/codegen/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/__init__.py +0 -0
- /tinygrad-0.9.1/tinygrad/runtime/__init__.py → /tinygrad-0.10.0/tinygrad/py.typed +0 -0
- {tinygrad-0.9.1/tinygrad/runtime/driver → tinygrad-0.10.0/tinygrad/runtime}/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/comgr.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/hip.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/hsa.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_npy.py +0 -0
- {tinygrad-0.9.1/tinygrad/shape → tinygrad-0.10.0/tinygrad/runtime/support}/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/dependency_links.txt +0 -0
- {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/top_level.txt +0 -0
{tinygrad-0.9.1 → tinygrad-0.10.0}/PKG-INFO

@@ -1,17 +1,14 @@
 Metadata-Version: 2.1
 Name: tinygrad
-Version: 0.9.1
+Version: 0.10.0
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy
-Requires-Dist: pyobjc-framework-Metal; platform_system == "Darwin"
-Requires-Dist: pyobjc-framework-libdispatch; platform_system == "Darwin"
 Provides-Extra: llvm
 Requires-Dist: llvmlite; extra == "llvm"
 Provides-Extra: arm
@@ -20,12 +17,13 @@ Provides-Extra: triton
 Requires-Dist: triton-nightly>=2.1.0.dev20231014192330; extra == "triton"
 Provides-Extra: linting
 Requires-Dist: pylint; extra == "linting"
-Requires-Dist: mypy; extra == "linting"
+Requires-Dist: mypy==1.11.2; extra == "linting"
 Requires-Dist: typing-extensions; extra == "linting"
 Requires-Dist: pre-commit; extra == "linting"
 Requires-Dist: ruff; extra == "linting"
 Requires-Dist: types-tqdm; extra == "linting"
 Provides-Extra: testing
+Requires-Dist: numpy; extra == "testing"
 Requires-Dist: torch; extra == "testing"
 Requires-Dist: pillow; extra == "testing"
 Requires-Dist: pytest; extra == "testing"
@@ -39,17 +37,21 @@ Requires-Dist: safetensors; extra == "testing"
 Requires-Dist: transformers; extra == "testing"
 Requires-Dist: sentencepiece; extra == "testing"
 Requires-Dist: tiktoken; extra == "testing"
+Requires-Dist: blobfile; extra == "testing"
 Requires-Dist: librosa; extra == "testing"
 Requires-Dist: networkx; extra == "testing"
 Requires-Dist: hypothesis; extra == "testing"
 Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
+Requires-Dist: ggml-python; extra == "testing"
 Provides-Extra: docs
+Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
 Requires-Dist: mkdocstrings[python]; extra == "docs"
 Requires-Dist: markdown-callouts; extra == "docs"
 Requires-Dist: markdown-exec[ansi]; extra == "docs"
 Requires-Dist: black; extra == "docs"
+Requires-Dist: numpy; extra == "docs"
 Provides-Extra: testing-tf
 Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
 Requires-Dist: tensorflow_addons; extra == "testing-tf"
@@ -107,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.

-```
+```python
 from tinygrad import Tensor, nn

 class LinearNet:
@@ -122,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

-
-
-
-
-
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```

 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -142,9 +145,12 @@ tinygrad already supports numerous accelerators, including:
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
 - [x] [AMD](tinygrad/runtime/ops_amd.py)
 - [x] [NV](tinygrad/runtime/ops_nv.py)
+- [x] [QCOM](tinygrad/runtime/ops_qcom.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

+To check default accelerator run: `python3 -c "from tinygrad import Device; print(Device.DEFAULT)"`
+
 ## Installation

 The current recommended way to install tinygrad is from source.
@@ -169,7 +175,7 @@ Documentation along with a quick start guide can be found on the [docs website](

 ### Quick example comparing to PyTorch

-```
+```python
 from tinygrad import Tensor

 x = Tensor.eye(3, requires_grad=True)
@@ -182,7 +188,7 @@ print(y.grad.numpy()) # dz/dy
 ```

 The same thing but in PyTorch:
-```
+```python
 import torch

 x = torch.eye(3, requires_grad=True)
@@ -230,6 +236,4 @@ python3 -m pytest test/ # whole test suite

 #### Process replay tests

-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/README.md) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [pr] in the pull request title.
{tinygrad-0.9.1 → tinygrad-0.10.0}/README.md

@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.

-```
+```python
 from tinygrad import Tensor, nn

 class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

-
-
-
-
-
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```

 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -86,9 +87,12 @@ tinygrad already supports numerous accelerators, including:
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
 - [x] [AMD](tinygrad/runtime/ops_amd.py)
 - [x] [NV](tinygrad/runtime/ops_nv.py)
+- [x] [QCOM](tinygrad/runtime/ops_qcom.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

+To check default accelerator run: `python3 -c "from tinygrad import Device; print(Device.DEFAULT)"`
+
 ## Installation

 The current recommended way to install tinygrad is from source.
@@ -113,7 +117,7 @@ Documentation along with a quick start guide can be found on the [docs website](

 ### Quick example comparing to PyTorch

-```
+```python
 from tinygrad import Tensor

 x = Tensor.eye(3, requires_grad=True)
@@ -126,7 +130,7 @@ print(y.grad.numpy()) # dz/dy
 ```

 The same thing but in PyTorch:
-```
+```python
 import torch

 x = torch.eye(3, requires_grad=True)
@@ -174,6 +178,4 @@ python3 -m pytest test/ # whole test suite

 #### Process replay tests

-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/README.md) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [pr] in the pull request title.
{tinygrad-0.9.1 → tinygrad-0.10.0}/setup.py

@@ -8,36 +8,36 @@ with open(directory / 'README.md', encoding='utf-8') as f:
   long_description = f.read()

 setup(name='tinygrad',
-      version='0.9.1',
+      version='0.10.0',
       description='You like pytorch? You like micrograd? You love tinygrad! <3',
       author='George Hotz',
       license='MIT',
       long_description=long_description,
       long_description_content_type='text/markdown',
       packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                  'tinygrad.runtime', 'tinygrad.runtime.
+                  'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+      package_data = {'tinygrad': ['py.typed']},
       classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License"
       ],
-      install_requires=[
-
-                        "pyobjc-framework-libdispatch; platform_system=='Darwin'"],
-      python_requires='>=3.8',
+      install_requires=[],
+      python_requires='>=3.10',
       extras_require={
         'llvm': ["llvmlite"],
         'arm': ["unicorn"],
         'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
         'linting': [
           "pylint",
-          "mypy",
+          "mypy==1.11.2",
           "typing-extensions",
           "pre-commit",
           "ruff",
           "types-tqdm",
         ],
-        #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.
+        #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.1.0-rc3"],
         'testing': [
+          "numpy",
           "torch",
           "pillow",
           "pytest",
@@ -51,18 +51,22 @@ setup(name='tinygrad',
           "transformers",
           "sentencepiece",
           "tiktoken",
+          "blobfile",
           "librosa",
           "networkx",
           "hypothesis",
           "nibabel",
           "bottle",
+          "ggml-python"
         ],
         'docs': [
+          "mkdocs",
           "mkdocs-material",
           "mkdocstrings[python]",
           "markdown-callouts",
           "markdown-exec[ansi]",
-          "black"
+          "black",
+          "numpy",
         ],
         'testing_tf': [
           "tensorflow==2.15.1",
tinygrad-0.10.0/test/test_arange.py

@@ -0,0 +1,179 @@
+import unittest, contextlib
+import numpy as np
+from tinygrad import Tensor, GlobalCounters, dtypes, nn
+from tinygrad.helpers import CI, Context, getenv
+from tinygrad.engine.realize import run_schedule
+from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+from tinygrad.engine.realize import CompiledRunner, ExecItem
+from tinygrad.engine.search import get_kernel_actions
+
+class TestArange(unittest.TestCase):
+  def _get_flops(self, N, opts=None):
+    GlobalCounters.reset()
+    tt = Tensor.arange(N)
+    sched = tt.schedule()
+    self.assertEqual(len(sched), 1)
+    k = Kernel(sched[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    p = k.to_program()
+    print(p.name)
+    #print(p.src)
+    ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+    np.testing.assert_equal(tt.numpy(), np.arange(N))
+    return p.op_estimate
+
+  def test_complexity(self, opts=None, limit=None):
+    # add 1 to avoid divide by 0. arange is 0 flops now!
+    f1 = self._get_flops(256, opts) + 1
+    f2 = self._get_flops(2560, opts) + 1
+    print(f"{f1=}, {f2=}")
+    assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+    if limit is not None and not getenv("PTX"):
+      # PTX counts index ALU in flops
+      assert f1 <= limit, f"{f1=}, {limit=}"
+
+  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=1)
+  def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=1)
+  def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=1)
+  def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=1)
+  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=1)
+
+  @unittest.skip("doesn't work yet")
+  def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+  def test_all_opts(self, opts=None, exclude=None):
+    k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+    for opts in all_opts:
+      if exclude is not None and opts[-1] in exclude: continue
+      print(opts)
+      self.test_complexity(opts)
+  def test_all_opts_w_local(self):
+    with contextlib.suppress(KernelOptError):
+      return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+  def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+  def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+  def test_all_opts_w_upcast_and_unroll(self):
+    return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+class TestIndexing(unittest.TestCase):
+  def test_arange_2_reduce(self):
+    needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+    needle[1337] = 1
+    needle.realize()
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      # TODO: it should work without these reshapes
+      out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+      sched = out.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+    assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_manual_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+      idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+      reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+      X = full.sum(axis=(2,3))
+      sched = X.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      # TODO: enable these asserts when the scheduler can handle this
+      #assert len(sched) == 1, f"{len(sched)} != 1"
+      run_schedule(sched)
+      #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index_fused(self, noopt=1):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      assert len(sched) == 2
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(real_index, X.numpy())
+  @unittest.skip("not ready")
+  def test_index_fused_opt(self): self.test_index_fused(0)
+
+  def test_index_fused_out_of_bounds(self):
+    dataset = Tensor.rand(256, 256).realize()
+    idxs = Tensor([-19238, -257, 256, 495, 10982377]).realize()
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      X = dataset[idxs]
+      np.testing.assert_equal(X.numpy(), 0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_index_mnist(self, noopt=1, op_limit=512*784*5):
+    from tinygrad.nn.datasets import mnist
+    X_train, Y_train, _, _ = mnist()
+    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+      samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0]).realize()
+      GlobalCounters.reset()
+      x = X_train[samples].numpy()
+      y = Y_train[samples].numpy()
+      assert GlobalCounters.global_ops < op_limit, f"too many ops {GlobalCounters.global_ops} != {op_limit}"
+      np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+      np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+  @unittest.skip("not ready")
+  def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_llama_embedding(self, noopt=1, op_limit=65536):
+    # llama3 is 128256
+    vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+    emb = nn.Embedding(vocab_size, embed_size)
+    emb_w = emb.weight.numpy()
+    x = Tensor([1,2,3,4])
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      z = emb(x).realize()
+      self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+      self.assertEqual(GlobalCounters.kernel_count, 2)
+    if getenv("CHECK", 1):
+      import torch
+      with torch.no_grad():
+        torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+        torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+      torch_z = torch_emb(torch.tensor(x.numpy()))
+      # TODO: reshape to match torch, should we do this in nn?
+      np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+  # at least the arange is being fused
+  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000 if CI else 5_898_240_000)
+
+if __name__ == "__main__":
+  unittest.main()
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_assign.py

@@ -2,6 +2,7 @@
 import unittest
 import numpy as np
 from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
+from tinygrad.engine.schedule import create_schedule

 N = 200 # has to be bigger than the cache to fail

@@ -57,10 +58,12 @@ class TestAssign(unittest.TestCase):
     x.realize()
     x = Tensor([0])
     f(x)
-
+    out = x.item()
+    assert out == 1, f"expected 1, got {out}"
     x = Tensor([0])
     f(x)
-
+    out = x.item()
+    assert out == 1, f"expected 1, got {out}"

   def test_assign_add_jit(self):
     @TinyJit
@@ -165,6 +168,16 @@ class TestAssign(unittest.TestCase):
     a += 1
     np.testing.assert_allclose(a.numpy(), 3)

+  # NOTE: this is similar to the resnet failure
+  #@unittest.expectedFailure
+  def test_double_assign_alt(self):
+    a = Tensor.ones(4).contiguous().realize()
+    b = Tensor([1, 2, 3, 4]).realize().lazydata
+    a1 = a.lazydata.assign(b)
+    a2 = a.lazydata.assign(b)
+    sched = create_schedule([a1, a2])
+    self.assertEqual(len(sched), 1)
+
   def test_crossover_assign(self):
     a = Tensor.full((4,), 2).contiguous().realize()
     b = Tensor.full((4,), 3).contiguous().realize()
@@ -347,7 +360,7 @@ class TestAssign(unittest.TestCase):

   def test_permuted_assignment_masked_view_possible(self):
     a = Tensor.ones(4, 4).contiguous().realize()
-    b = a.shrink((None, (0, 2))).pad((None, (0, 2)), 2)
+    b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2)
     a.assign(a + b)
     kc = GlobalCounters.kernel_count
     a.realize()
@@ -357,7 +370,7 @@ class TestAssign(unittest.TestCase):
   def test_permuted_assignment_masked_view_not_contiguous(self):
     a = Tensor.ones(4, 4).contiguous().realize()
     with self.assertRaisesRegex(RuntimeError, "contiguous"):
-      b = a.shrink((None, (0, 2))).pad((None, (0, 2)), 2).permute(1, 0)
+      b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2).permute(1, 0)
       a.assign(a + b)
       a.realize()

tinygrad-0.10.0/test/test_compile_failures.py

@@ -0,0 +1,18 @@
+import unittest
+from tinygrad import Tensor, dtypes, Device
+from tinygrad.engine.realize import lower_schedule
+from tinygrad.device import is_dtype_supported
+
+class TestCompileFailures(unittest.TestCase):
+  def compile(self, out:Tensor):
+    for _ in lower_schedule(out.schedule()): pass
+
+  @unittest.skipUnless(is_dtype_supported(dtypes.uchar, Device.DEFAULT), f"no uint8 on {Device.DEFAULT}")
+  def test_interpolate_atari(self):
+    self.compile(Tensor.empty(210, 160, dtype='uint8').interpolate((64, 64)))
+
+  def test_add_max_uchar(self):
+    self.compile((Tensor.empty(1024, dtype='uint8') + Tensor.empty(1024, dtype='uint8')).max())
+
+if __name__ == '__main__':
+  unittest.main()
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_const_folding.py

@@ -1,15 +1,15 @@
 import unittest, math
 from tinygrad import Tensor, Device, dtypes
+from tinygrad.ops import Ops
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.helpers import CI
-from tinygrad.ops import BufferOps
 import numpy as np
-from
+from tinygrad.device import is_dtype_supported

 def _check_ast_count(desired_count:int, t:Tensor):
   # NOTE: this has side effect because everything can be scheduled only once
   schedule = create_schedule(t.lazydata.lbs)
-  asts = [s for s in schedule if s.ast
+  asts = [s for s in schedule if s.ast.op is Ops.SINK]
   assert len(asts) == desired_count

 class TestUnaryOpsConstFolding(unittest.TestCase):
@@ -23,6 +23,7 @@ class TestUnaryOpsConstFolding(unittest.TestCase):
     _check_ast_count(0, Tensor.ones(4).cast(dtypes.int16))
     _check_ast_count(0, Tensor.full(4, fill_value=-1).cast(dtypes.uint16))

+  @unittest.expectedFailure # no two level fold at lazybuffer
   def test_neg_folding(self):
     _check_ast_count(0, Tensor([1, 2, 3]).mul(-1).neg())
     _check_ast_count(0, Tensor([1, 2, 3]).neg().mul(-1))
@@ -78,6 +79,11 @@ class TestBinaryOpsConstFolding(unittest.TestCase):
   def test_div_tensor_one(self):
     _check_ast_count(0, Tensor([1.0, 2, 3, 4]) / Tensor.ones(4))

+  def test_idiv_literal_one(self):
+    _check_ast_count(0, Tensor([1, 2, 3, 4]) // 1)
+  def test_idiv_tensor_one(self):
+    _check_ast_count(0, Tensor([1, 2, 3, 4]) // Tensor.ones(4, dtype=dtypes.int32))
+
   def test_pow_literal_zero(self):
     _check_ast_count(0, Tensor([1.0, 2, 3, 4]) ** 0)
   def test_pow_tensor_zero(self):
@@ -124,13 +130,16 @@ class TestMovedConstFolding(unittest.TestCase):

   def test_cast_padded(self):
     # NOTE: this is folded due to CAST_BEFORE_VIEW
-
-
-
-
+    if is_dtype_supported(dtypes.int16):
+      _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
+      np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
+    if is_dtype_supported(dtypes.uint16):
+      _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
+      np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
     # not folded
-
-
+    if is_dtype_supported(dtypes.int64):
+      _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64))
+      np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0])

 class TestReduceOpsConstFolding(unittest.TestCase):
   def test_const_sum(self):
@@ -145,10 +154,18 @@ class TestReduceOpsConstFolding(unittest.TestCase):
     _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).sum())
     np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).sum().numpy(), 4)

-    # NOTE: cannot just count the non-padded area because some
+    # NOTE: cannot just count the non-padded area because some Ops f do not have f(0) = 0.
     _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).exp().sum())
     np.testing.assert_allclose(Tensor.ones(4).pad(((1, 1),)).exp().sum().numpy(), 4 * math.e + 2)

+  def test_const_prod(self):
+    _check_ast_count(0, Tensor.full((2, 3), fill_value=2).prod())
+    np.testing.assert_equal(Tensor.full((2, 3), fill_value=2).prod().numpy(), 2**(2*3))
+    _check_ast_count(0, Tensor.full((4, 5, 6), fill_value=2).prod(axis=0))
+    np.testing.assert_equal(Tensor.full((4, 5, 6), fill_value=2).prod(axis=0).numpy(), np.full((5, 6), 2**4))
+    _check_ast_count(0, Tensor(4).prod())
+    np.testing.assert_equal(Tensor(4).prod().numpy(), 4)
+
   def test_const_max(self):
     _check_ast_count(0, Tensor.ones(4, 5, 6).max())
     np.testing.assert_equal(Tensor.ones(4, 5, 6).max().numpy(), 1)
@@ -234,7 +251,6 @@ class TestTautologicalCompare(unittest.TestCase):
     np.testing.assert_equal((Tensor(True) < Tensor(False)).numpy(), False)
     np.testing.assert_equal((Tensor(True) < Tensor(True)).numpy(), False)

-  @unittest.skip("not implemented yet")
   def test_a_eq_a(self):
     # self eq is always true for int or bool
     a = Tensor([1, 2, 3])
@@ -244,7 +260,6 @@ class TestTautologicalCompare(unittest.TestCase):
     a = Tensor([math.nan, 1.0, 2.0])
     np.testing.assert_equal((a == a).numpy(), [False, True, True])

-  @unittest.skip("not implemented yet")
   def test_a_ne_a(self):
     # self not eq is always false for int or bool
     a = Tensor([1, 2, 3])
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_conv.py

@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):

     print(ret.numpy())

+  def test_two_binops_no_rerun_small(self):
+    Tensor.no_grad = True
+    x = Tensor.rand(1,1,32,32)
+    w = Tensor.rand(1,1,3,3)
+    out = x.conv2d(w, padding=(1,1))
+    np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+    Tensor.no_grad = False
+
   def test_two_binops_no_rerun(self):
     Tensor.no_grad = True
     x = Tensor.randn(1,12,128,256)
|