tinygrad 0.10.0__tar.gz → 0.10.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tinygrad-0.10.0 → tinygrad-0.10.2}/PKG-INFO +36 -13
- {tinygrad-0.10.0 → tinygrad-0.10.2}/README.md +7 -6
- {tinygrad-0.10.0 → tinygrad-0.10.2}/setup.py +23 -14
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_arange.py +18 -17
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_assign.py +18 -11
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_const_folding.py +80 -10
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_conv_shapetracker.py +5 -8
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_copy_speed.py +5 -5
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_device_speed.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_dtype.py +55 -17
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_dtype_alu.py +41 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_fusion_op.py +8 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_fuzz_shape_ops.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_gc.py +24 -8
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_graph.py +2 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_hcq.py +105 -77
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_image_dtype.py +63 -8
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_jit.py +102 -3
- tinygrad-0.10.2/test/test_kernel_cache.py +29 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer.py +209 -180
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_dumb.py +9 -7
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_failures.py +99 -119
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_overflows.py +11 -11
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_metal.py +3 -5
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_multitensor.py +171 -53
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_net_speed.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_nn.py +48 -36
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_ops.py +520 -95
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_optim.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_pickle.py +60 -6
- tinygrad-0.10.2/test/test_profiler.py +163 -0
- tinygrad-0.10.2/test/test_quantize_onnx.py +212 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_randomness.py +4 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_renderer_failures.py +17 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_sample.py +2 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_schedule.py +944 -183
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_search.py +43 -11
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_setitem.py +24 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_speed_v_torch.py +10 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_subbuffer.py +20 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_ops.py +0 -2
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_shapetracker.py +12 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor.py +142 -58
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor_data.py +14 -0
- tinygrad-0.10.0/test/test_lazybuffer.py → tinygrad-0.10.2/test/test_tensor_uop.py +28 -45
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tiny.py +44 -10
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_transcendental.py +19 -8
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uop_graph.py +128 -67
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uops.py +229 -61
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uops_stats.py +32 -18
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_winograd.py +11 -4
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_zero_copy.py +1 -1
- tinygrad-0.10.2/tinygrad/codegen/devectorizer.py +247 -0
- tinygrad-0.10.2/tinygrad/codegen/expander.py +121 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/kernel.py +141 -201
- tinygrad-0.10.2/tinygrad/codegen/linearize.py +234 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/lowerer.py +60 -42
- tinygrad-0.10.2/tinygrad/codegen/symbolic.py +476 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/transcendental.py +22 -13
- tinygrad-0.10.2/tinygrad/device.py +361 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/dtype.py +39 -28
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/jit.py +83 -65
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/memory.py +4 -5
- tinygrad-0.10.2/tinygrad/engine/multi.py +161 -0
- tinygrad-0.10.2/tinygrad/engine/realize.py +171 -0
- tinygrad-0.10.2/tinygrad/engine/schedule.py +458 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/search.py +55 -66
- tinygrad-0.10.2/tinygrad/gradient.py +73 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/helpers.py +81 -59
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/__init__.py +30 -32
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/datasets.py +1 -2
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/optim.py +22 -26
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/state.py +91 -66
- tinygrad-0.10.2/tinygrad/ops.py +1003 -0
- tinygrad-0.10.2/tinygrad/renderer/__init__.py +148 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/cstyle.py +99 -92
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/llvmir.py +83 -34
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/ptx.py +83 -99
- tinygrad-0.10.2/tinygrad/renderer/wgsl.py +95 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/comgr.py +2 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/kfd.py +4 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/kgsl.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/libc.py +404 -71
- tinygrad-0.10.2/tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad-0.10.2/tinygrad/runtime/autogen/pci.py +1333 -0
- tinygrad-0.10.2/tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad-0.10.2/tinygrad/runtime/autogen/webgpu.py +6985 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad-0.10.2/tinygrad/runtime/graph/hcq.py +205 -0
- tinygrad-0.10.2/tinygrad/runtime/graph/metal.py +100 -0
- tinygrad-0.10.2/tinygrad/runtime/ops_amd.py +635 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad-0.10.2/tinygrad/runtime/ops_cpu.py +24 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_cuda.py +30 -27
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_disk.py +62 -63
- tinygrad-0.10.2/tinygrad/runtime/ops_dsp.py +298 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_gpu.py +30 -30
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_hip.py +29 -31
- tinygrad-0.10.2/tinygrad/runtime/ops_llvm.py +58 -0
- tinygrad-0.10.2/tinygrad/runtime/ops_metal.py +224 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_npy.py +2 -2
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_nv.py +238 -273
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_python.py +55 -50
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad-0.10.2/tinygrad/runtime/ops_webgpu.py +225 -0
- tinygrad-0.10.2/tinygrad/runtime/support/allocator.py +94 -0
- tinygrad-0.10.2/tinygrad/runtime/support/am/amdev.py +396 -0
- tinygrad-0.10.2/tinygrad/runtime/support/am/ip.py +463 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/compiler_cuda.py +4 -2
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/elf.py +28 -4
- tinygrad-0.10.2/tinygrad/runtime/support/hcq.py +471 -0
- tinygrad-0.10.2/tinygrad/runtime/support/llvm.py +26 -0
- tinygrad-0.10.2/tinygrad/shape/__init__.py +0 -0
- tinygrad-0.10.2/tinygrad/shape/shapetracker.py +143 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/shape/view.py +104 -140
- tinygrad-0.10.2/tinygrad/spec.py +155 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/tensor.py +835 -527
- tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
- tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
- tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
- tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
- tinygrad-0.10.2/tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
- tinygrad-0.10.2/tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
- tinygrad-0.10.2/tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
- tinygrad-0.10.2/tinygrad/viz/index.html +544 -0
- tinygrad-0.10.2/tinygrad/viz/perfetto.html +178 -0
- tinygrad-0.10.2/tinygrad/viz/serve.py +205 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/PKG-INFO +36 -13
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/SOURCES.txt +31 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/requires.txt +21 -6
- tinygrad-0.10.0/test/test_kernel_cache.py +0 -27
- tinygrad-0.10.0/test/test_profiler.py +0 -221
- tinygrad-0.10.0/test/test_viz.py +0 -93
- tinygrad-0.10.0/tinygrad/codegen/linearize.py +0 -95
- tinygrad-0.10.0/tinygrad/codegen/uopgraph.py +0 -506
- tinygrad-0.10.0/tinygrad/device.py +0 -221
- tinygrad-0.10.0/tinygrad/engine/lazy.py +0 -228
- tinygrad-0.10.0/tinygrad/engine/realize.py +0 -217
- tinygrad-0.10.0/tinygrad/engine/schedule.py +0 -419
- tinygrad-0.10.0/tinygrad/function.py +0 -212
- tinygrad-0.10.0/tinygrad/multi.py +0 -177
- tinygrad-0.10.0/tinygrad/ops.py +0 -1152
- tinygrad-0.10.0/tinygrad/renderer/__init__.py +0 -89
- tinygrad-0.10.0/tinygrad/runtime/graph/clang.py +0 -39
- tinygrad-0.10.0/tinygrad/runtime/graph/hcq.py +0 -200
- tinygrad-0.10.0/tinygrad/runtime/graph/metal.py +0 -103
- tinygrad-0.10.0/tinygrad/runtime/ops_amd.py +0 -471
- tinygrad-0.10.0/tinygrad/runtime/ops_clang.py +0 -35
- tinygrad-0.10.0/tinygrad/runtime/ops_dsp.py +0 -181
- tinygrad-0.10.0/tinygrad/runtime/ops_llvm.py +0 -51
- tinygrad-0.10.0/tinygrad/runtime/ops_metal.py +0 -188
- tinygrad-0.10.0/tinygrad/runtime/support/hcq.py +0 -539
- tinygrad-0.10.0/tinygrad/shape/shapetracker.py +0 -111
- {tinygrad-0.10.0 → tinygrad-0.10.2}/LICENSE +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/setup.cfg +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_compile_failures.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_conv.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_masked_st.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_method_cache.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_ocl.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_rearrange_einops.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_specific_conv.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_jit.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor_variable.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_to_numpy.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/py.typed +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/adreno.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/cuda.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/hip.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/hsa.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/io_uring.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/nv_gpu.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/nvrtc.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/opencl.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/qcom_dsp.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/graph/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/__init__.py +0 -0
- {tinygrad-0.10.0/tinygrad/shape → tinygrad-0.10.2/tinygrad/runtime/support/am}/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/compiler_hip.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/dependency_links.txt +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/top_level.txt +0 -0
--- tinygrad-0.10.0/PKG-INFO
+++ tinygrad-0.10.2/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: tinygrad
-Version: 0.10.0
+Version: 0.10.2
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
@@ -9,25 +9,39 @@ Classifier: License :: OSI Approved :: MIT License
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Provides-Extra: llvm
-Requires-Dist: llvmlite; extra == "llvm"
 Provides-Extra: arm
 Requires-Dist: unicorn; extra == "arm"
 Provides-Extra: triton
 Requires-Dist: triton-nightly>=2.1.0.dev20231014192330; extra == "triton"
 Provides-Extra: linting
 Requires-Dist: pylint; extra == "linting"
-Requires-Dist: mypy==1.
+Requires-Dist: mypy==1.13.0; extra == "linting"
 Requires-Dist: typing-extensions; extra == "linting"
 Requires-Dist: pre-commit; extra == "linting"
 Requires-Dist: ruff; extra == "linting"
 Requires-Dist: types-tqdm; extra == "linting"
+Provides-Extra: testing-minimal
+Requires-Dist: numpy; extra == "testing-minimal"
+Requires-Dist: torch; extra == "testing-minimal"
+Requires-Dist: pytest; extra == "testing-minimal"
+Requires-Dist: pytest-xdist; extra == "testing-minimal"
+Requires-Dist: hypothesis; extra == "testing-minimal"
+Provides-Extra: testing-unit
+Requires-Dist: numpy; extra == "testing-unit"
+Requires-Dist: torch; extra == "testing-unit"
+Requires-Dist: pytest; extra == "testing-unit"
+Requires-Dist: pytest-xdist; extra == "testing-unit"
+Requires-Dist: hypothesis; extra == "testing-unit"
+Requires-Dist: tqdm; extra == "testing-unit"
+Requires-Dist: safetensors; extra == "testing-unit"
+Requires-Dist: tabulate; extra == "testing-unit"
 Provides-Extra: testing
 Requires-Dist: numpy; extra == "testing"
 Requires-Dist: torch; extra == "testing"
-Requires-Dist: pillow; extra == "testing"
 Requires-Dist: pytest; extra == "testing"
 Requires-Dist: pytest-xdist; extra == "testing"
+Requires-Dist: hypothesis; extra == "testing"
+Requires-Dist: pillow; extra == "testing"
 Requires-Dist: onnx==1.16.0; extra == "testing"
 Requires-Dist: onnx2torch; extra == "testing"
 Requires-Dist: opencv-python; extra == "testing"
@@ -40,10 +54,10 @@ Requires-Dist: tiktoken; extra == "testing"
 Requires-Dist: blobfile; extra == "testing"
 Requires-Dist: librosa; extra == "testing"
 Requires-Dist: networkx; extra == "testing"
-Requires-Dist: hypothesis; extra == "testing"
 Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
 Requires-Dist: ggml-python; extra == "testing"
+Requires-Dist: capstone; extra == "testing"
 Provides-Extra: docs
 Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
@@ -55,6 +69,14 @@ Requires-Dist: numpy; extra == "docs"
 Provides-Extra: testing-tf
 Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
 Requires-Dist: tensorflow_addons; extra == "testing-tf"
+Dynamic: author
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: provides-extra
+Dynamic: requires-python
+Dynamic: summary

 <div align="center">

@@ -139,13 +161,14 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers
 tinygrad already supports numerous accelerators, including:

 - [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
-- [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py)
+- [x] [CPU (C Code)](tinygrad/runtime/ops_cpu.py)
 - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
 - [x] [AMD](tinygrad/runtime/ops_amd.py)
 - [x] [NV](tinygrad/runtime/ops_nv.py)
 - [x] [QCOM](tinygrad/runtime/ops_qcom.py)
+- [x] [WEBGPU](tinygrad/runtime/ops_webgpu.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -183,8 +206,8 @@ y = Tensor([[2.0,0,-2.0]], requires_grad=True)
 z = y.matmul(x).sum()
 z.backward()

-print(x.grad.
-print(y.grad.
+print(x.grad.tolist()) # dz/dx
+print(y.grad.tolist()) # dz/dy
 ```

 The same thing but in PyTorch:
@@ -196,8 +219,8 @@ y = torch.tensor([[2.0,0,-2.0]], requires_grad=True)
 z = y.matmul(x).sum()
 z.backward()

-print(x.grad.
-print(y.grad.
+print(x.grad.tolist()) # dz/dx
+print(y.grad.tolist()) # dz/dy
 ```

 ## Contributing
@@ -208,7 +231,7 @@ We'll start with what will get your PR closed with a pointer to this section:

 - No code golf! While low line count is a guiding light of this project, anything that remotely looks like code golf will be closed. The true goal is reducing complexity and increasing readability, and deleting `\n`s does nothing to help with that.
 - All docs and whitespace changes will be closed unless you are a well-known contributor. The people writing the docs should be those who know the codebase the absolute best. People who have not demonstrated that shouldn't be messing with docs. Whitespace changes are both useless *and* carry a risk of introducing bugs.
-- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with
+- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainability and readability.
 - In general, the code outside the core `tinygrad/` folder is not well tested, so unless the current code there is broken, you shouldn't be changing it.
 - If your PR looks "complex", is a big diff, or adds lots of lines, it won't be reviewed or merged. Consider breaking it up into smaller PRs that are individually clear wins. A common pattern I see is prerequisite refactors before adding new functionality. If you can (cleanly) refactor to the point that the feature is a 3 line change, this is great, and something easy for us to review.

--- tinygrad-0.10.0/README.md
+++ tinygrad-0.10.2/README.md
@@ -81,13 +81,14 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers
 tinygrad already supports numerous accelerators, including:

 - [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
-- [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py)
+- [x] [CPU (C Code)](tinygrad/runtime/ops_cpu.py)
 - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
 - [x] [AMD](tinygrad/runtime/ops_amd.py)
 - [x] [NV](tinygrad/runtime/ops_nv.py)
 - [x] [QCOM](tinygrad/runtime/ops_qcom.py)
+- [x] [WEBGPU](tinygrad/runtime/ops_webgpu.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -125,8 +126,8 @@ y = Tensor([[2.0,0,-2.0]], requires_grad=True)
 z = y.matmul(x).sum()
 z.backward()

-print(x.grad.
-print(y.grad.
+print(x.grad.tolist()) # dz/dx
+print(y.grad.tolist()) # dz/dy
 ```

 The same thing but in PyTorch:
@@ -138,8 +139,8 @@ y = torch.tensor([[2.0,0,-2.0]], requires_grad=True)
 z = y.matmul(x).sum()
 z.backward()

-print(x.grad.
-print(y.grad.
+print(x.grad.tolist()) # dz/dx
+print(y.grad.tolist()) # dz/dy
 ```

 ## Contributing
@@ -150,7 +151,7 @@ We'll start with what will get your PR closed with a pointer to this section:

 - No code golf! While low line count is a guiding light of this project, anything that remotely looks like code golf will be closed. The true goal is reducing complexity and increasing readability, and deleting `\n`s does nothing to help with that.
 - All docs and whitespace changes will be closed unless you are a well-known contributor. The people writing the docs should be those who know the codebase the absolute best. People who have not demonstrated that shouldn't be messing with docs. Whitespace changes are both useless *and* carry a risk of introducing bugs.
-- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with
+- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainability and readability.
 - In general, the code outside the core `tinygrad/` folder is not well tested, so unless the current code there is broken, you shouldn't be changing it.
 - If your PR looks "complex", is a big diff, or adds lots of lines, it won't be reviewed or merged. Consider breaking it up into smaller PRs that are individually clear wins. A common pattern I see is prerequisite refactors before adding new functionality. If you can (cleanly) refactor to the point that the feature is a 3 line change, this is great, and something easy for us to review.

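For reference, after this change the README's autograd example prints gradients with `tolist()` instead of the old accessor. Assembled from the hunk context above (the `Tensor.eye` setup lines sit outside the hunk and are assumed unchanged), the updated snippet reads:

```python
from tinygrad import Tensor

x = Tensor.eye(3, requires_grad=True)
y = Tensor([[2.0,0,-2.0]], requires_grad=True)
z = y.matmul(x).sum()
z.backward()

print(x.grad.tolist())  # dz/dx
print(y.grad.tolist())  # dz/dy
```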
--- tinygrad-0.10.0/setup.py
+++ tinygrad-0.10.2/setup.py
@@ -7,16 +7,24 @@ directory = Path(__file__).resolve().parent
 with open(directory / 'README.md', encoding='utf-8') as f:
 long_description = f.read()

+testing_minimal = [
+"numpy",
+"torch",
+"pytest",
+"pytest-xdist",
+"hypothesis",
+]
+
 setup(name='tinygrad',
-version='0.10.0',
+version='0.10.2',
 description='You like pytorch? You like micrograd? You love tinygrad! <3',
 author='George Hotz',
 license='MIT',
 long_description=long_description,
 long_description_content_type='text/markdown',
-packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
-package_data = {'tinygrad': ['py.typed']},
+packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine', 'tinygrad.viz',
+'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.support.am', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'perfetto.html', 'assets/**/*']},
 classifiers=[
 "Programming Language :: Python :: 3",
 "License :: OSI Approved :: MIT License"
@@ -24,24 +32,25 @@ setup(name='tinygrad',
 install_requires=[],
 python_requires='>=3.10',
 extras_require={
-'llvm': ["llvmlite"],
 'arm': ["unicorn"],
 'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
 'linting': [
 "pylint",
-"mypy==1.
+"mypy==1.13.0",
 "typing-extensions",
 "pre-commit",
 "ruff",
 "types-tqdm",
 ],
 #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.1.0-rc3"],
-'testing': [
-"numpy",
-"torch",
+'testing_minimal': testing_minimal,
+'testing_unit': testing_minimal + [
+"tqdm",
+"safetensors",
+"tabulate" # for sz.py
+],
+'testing': testing_minimal + [
 "pillow",
-"pytest",
-"pytest-xdist",
 "onnx==1.16.0",
 "onnx2torch",
 "opencv-python",
@@ -54,10 +63,10 @@ setup(name='tinygrad',
 "blobfile",
 "librosa",
 "networkx",
-"hypothesis",
 "nibabel",
 "bottle",
-"ggml-python"
+"ggml-python",
+"capstone"
 ],
 'docs': [
 "mkdocs",
@@ -71,6 +80,6 @@ setup(name='tinygrad',
 'testing_tf': [
 "tensorflow==2.15.1",
 "tensorflow_addons",
-]
+],
 },
 include_package_data=True)
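The packaging changes above have two visible effects on an installed 0.10.2: the new extras groups show up in the wheel metadata, and the `tinygrad.viz` HTML assets (new files in the listing at the top) ship inside the package. A quick standard-library check, as a sketch; extras names are normalized to dashed form by packaging, and the path layout is taken from the file list above:

```python
from importlib.metadata import metadata
import os
import tinygrad

# expect 'testing-minimal' and 'testing-unit' alongside 'testing', 'docs', 'testing-tf', ...
print(metadata("tinygrad").get_all("Provides-Extra"))

# assets packaged by the new package_data entry for 'tinygrad.viz'
viz_dir = os.path.join(os.path.dirname(tinygrad.__file__), "viz")
print(os.path.isfile(os.path.join(viz_dir, "index.html")))     # expected True on 0.10.2
print(os.path.isfile(os.path.join(viz_dir, "perfetto.html")))  # expected True on 0.10.2
```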
--- tinygrad-0.10.0/test/test_arange.py
+++ tinygrad-0.10.2/test/test_arange.py
@@ -1,11 +1,12 @@
 import unittest, contextlib
 import numpy as np
-from tinygrad import Tensor, GlobalCounters, dtypes, nn
+from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device
 from tinygrad.helpers import CI, Context, getenv
 from tinygrad.engine.realize import run_schedule
 from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
 from tinygrad.engine.realize import CompiledRunner, ExecItem
 from tinygrad.engine.search import get_kernel_actions
+from tinygrad.ops import Ops

 class TestArange(unittest.TestCase):
 def _get_flops(self, N, opts=None):
@@ -21,7 +22,7 @@ class TestArange(unittest.TestCase):
 #print(p.src)
 ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
 np.testing.assert_equal(tt.numpy(), np.arange(N))
-return p.
+return p.estimates.ops

 def test_complexity(self, opts=None, limit=None):
 # add 1 to avoid divide by 0. arange is 0 flops now!
@@ -40,7 +41,7 @@ class TestArange(unittest.TestCase):
 def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=1)

 @unittest.skip("doesn't work yet")
-def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1,
+def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, arg=32)])

 def test_all_opts(self, opts=None, exclude=None):
 k = Kernel(Tensor.arange(256).schedule()[-1].ast)
@@ -58,11 +59,11 @@ class TestArange(unittest.TestCase):
 self.test_complexity(opts)
 def test_all_opts_w_local(self):
 with contextlib.suppress(KernelOptError):
-return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1,
+return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, arg=32)])
 def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
-def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0,
+def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])
 def test_all_opts_w_upcast_and_unroll(self):
-return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0,
+return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])

 class TestIndexing(unittest.TestCase):
 def test_arange_2_reduce(self):
@@ -71,12 +72,11 @@ class TestIndexing(unittest.TestCase):
 needle.realize()
 with Context(NOOPT=1, FUSE_ARANGE=1):
 GlobalCounters.reset()
-
-out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+out = ((Tensor.arange(1,16385)-1)*needle).sum()
 sched = out.schedule()
-
+self.assertEqual(len(sched), 1)
 run_schedule(sched)
-
+self.assertEqual(out.item(), 1337)

 @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
 def test_manual_index(self):
@@ -86,13 +86,13 @@ class TestIndexing(unittest.TestCase):
 print("*** indexing ***")
 with Context(NOOPT=1, FUSE_ARANGE=1):
 GlobalCounters.reset()
-rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int).
+rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, 256, 16384, 1)
 idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
 reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
 full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
 X = full.sum(axis=(2,3))
 sched = X.schedule()
-
+self.assertEqual(len(sched), 1)
 run_schedule(sched)
 assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
 np.testing.assert_allclose(real_index, X.numpy())
@@ -108,7 +108,7 @@ class TestIndexing(unittest.TestCase):
 assert X.shape == (4,256)
 sched = X.schedule()
 # TODO: enable these asserts when the scheduler can handle this
-#
+#self.assertEqual(len(sched), 1)
 run_schedule(sched)
 #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
 np.testing.assert_allclose(real_index, X.numpy())
@@ -123,7 +123,7 @@ class TestIndexing(unittest.TestCase):
 X = dataset[idxs]
 assert X.shape == (4,256)
 sched = X.schedule()
-
+self.assertEqual(len(sched), 2)
 run_schedule(sched)
 assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
 np.testing.assert_allclose(real_index, X.numpy())
@@ -138,7 +138,7 @@ class TestIndexing(unittest.TestCase):
 np.testing.assert_equal(X.numpy(), 0)

 @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
-def test_index_mnist(self, noopt=1, op_limit=512*784*
+def test_index_mnist(self, noopt=1, op_limit=512*784*13):
 from tinygrad.nn.datasets import mnist
 X_train, Y_train, _, _ = mnist()
 with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
@@ -152,12 +152,13 @@ class TestIndexing(unittest.TestCase):
 @unittest.skip("not ready")
 def test_index_mnist_opt(self): self.test_index_mnist(0)

-@unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+@unittest.skipIf(getenv("PTX") or Device.DEFAULT == "WEBGPU", "broken on ptx and WebGPU for some reason")
 def test_llama_embedding(self, noopt=1, op_limit=65536):
 # llama3 is 128256
 vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
 emb = nn.Embedding(vocab_size, embed_size)
-
+# TODO: why is a new realize needed here
+emb_w = emb.weight.realize().numpy()
 x = Tensor([1,2,3,4])
 with Context(NOOPT=noopt, FUSE_ARANGE=1):
 GlobalCounters.reset()
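The reworked `test_arange_2_reduce` above now asserts that the indexed reduction schedules as a single kernel under `FUSE_ARANGE`. A standalone sketch of that pattern (the one-hot `needle` is built here purely for illustration; the test constructs it in lines outside the hunk):

```python
from tinygrad import Tensor, GlobalCounters, dtypes
from tinygrad.helpers import Context
from tinygrad.engine.realize import run_schedule

needle = (Tensor.arange(16384) == 1337).cast(dtypes.int).realize()
with Context(NOOPT=1, FUSE_ARANGE=1):
  GlobalCounters.reset()
  out = ((Tensor.arange(1, 16385) - 1) * needle).sum()
  sched = out.schedule()
  run_schedule(sched)
  print(len(sched), out.item())  # the test expects one kernel and the value 1337
```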
--- tinygrad-0.10.0/test/test_assign.py
+++ tinygrad-0.10.2/test/test_assign.py
@@ -2,7 +2,8 @@
 import unittest
 import numpy as np
 from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
-from tinygrad.
+from tinygrad.device import is_dtype_supported
+from tinygrad.helpers import temp

 N = 200 # has to be bigger than the cache to fail

@@ -168,16 +169,6 @@ class TestAssign(unittest.TestCase):
 a += 1
 np.testing.assert_allclose(a.numpy(), 3)

-# NOTE: this is similar to the resnet failure
-#@unittest.expectedFailure
-def test_double_assign_alt(self):
-a = Tensor.ones(4).contiguous().realize()
-b = Tensor([1, 2, 3, 4]).realize().lazydata
-a1 = a.lazydata.assign(b)
-a2 = a.lazydata.assign(b)
-sched = create_schedule([a1, a2])
-self.assertEqual(len(sched), 1)
-
 def test_crossover_assign(self):
 a = Tensor.full((4,), 2).contiguous().realize()
 b = Tensor.full((4,), 3).contiguous().realize()
@@ -212,6 +203,7 @@ class TestAssign(unittest.TestCase):
 np.testing.assert_equal(b0.numpy(), 128)
 np.testing.assert_equal(b1.numpy(), 608)

+@unittest.skip("TODO: bring this assert back")
 def test_crossunder_assign(self):
 # NOTE: should *not* raise AssertionError from numpy
 with self.assertRaisesRegex(RuntimeError, "cycle"):
@@ -293,6 +285,7 @@ class TestAssign(unittest.TestCase):
 #assert ba1 == ba2 and ba1 != bb1
 np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

+@unittest.skip("multi output not supported anymore")
 def test_simple_assignment_multioutput(self):
 a = Tensor.randn(32, 32).realize()
 b = Tensor.full((32, ), 1.).contiguous().realize()
@@ -331,6 +324,7 @@ class TestAssign(unittest.TestCase):
 b.assign(r + b.permute(1, 0))
 b.realize()

+@unittest.skip("multi output not supported anymore")
 def test_permuted_reduceop_multioutput_dual_use(self):
 a = Tensor.randn(32, 32, 32).realize()
 b = Tensor.full((32, 32), 1.).contiguous().realize()
@@ -343,6 +337,7 @@ class TestAssign(unittest.TestCase):
 c.assign(r + b_perm)
 Tensor.realize(b, c)

+@unittest.skip("multi output not supported anymore")
 def test_permuted_reduceop_multioutput_dual_use_possible(self):
 a = Tensor.randn(32, 32, 32, dtype=dtypes.int).realize()
 b = Tensor.arange(32 * 32).reshape(32, 32).realize()
@@ -376,6 +371,14 @@ class TestAssign(unittest.TestCase):

 # TODO: is there a way to sneak in a permute such that it returns the wrong answer?

+@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
+def test_setitem_half(self):
+a = Tensor.full((8,), 1.0, dtype=dtypes.half).contiguous().realize()
+b = Tensor.full((4,), 2.0, dtype=dtypes.half).contiguous().realize()
+assign = a[:4].assign(b)
+assign.realize()
+np.testing.assert_allclose(a.numpy(), [2., 2., 2., 2., 1., 1., 1., 1.])
+
 @unittest.skip("don't use output buffer, and mismatch dtype no longer supported")
 def test_cast_assignment(self):
 a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
@@ -387,5 +390,9 @@ class TestAssign(unittest.TestCase):
 assert oba1 is None and oba2 is None
 np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))

+def test_disk_assignment(self):
+a = Tensor.empty(5, device=f"disk:{temp('disk_assignment')}").assign(Tensor.ones(5)).numpy()
+np.testing.assert_equal(a, np.ones(5))
+
 if __name__ == "__main__":
 unittest.main()
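The new `test_setitem_half` exercises assigning into a slice of a realized buffer; the half dtype is only there to gate the test on `is_dtype_supported`. The same pattern in default float32, as a minimal sketch mirroring the test:

```python
import numpy as np
from tinygrad import Tensor

a = Tensor.full((8,), 1.0).contiguous().realize()
b = Tensor.full((4,), 2.0).contiguous().realize()
a[:4].assign(b).realize()  # writes through the sliced view into a's buffer
np.testing.assert_allclose(a.numpy(), [2., 2., 2., 2., 1., 1., 1., 1.])
```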
--- tinygrad-0.10.0/test/test_const_folding.py
+++ tinygrad-0.10.2/test/test_const_folding.py
@@ -1,16 +1,18 @@
-import unittest, math
+import unittest, itertools, math
+from typing import Any
 from tinygrad import Tensor, Device, dtypes
-from tinygrad.
-from tinygrad.
+from tinygrad.dtype import DType
+from tinygrad.ops import Ops, UOp
 from tinygrad.helpers import CI
+from tinygrad.codegen.devectorizer import full_graph_rewrite
 import numpy as np
 from tinygrad.device import is_dtype_supported

 def _check_ast_count(desired_count:int, t:Tensor):
 # NOTE: this has side effect because everything can be scheduled only once
-schedule =
+schedule = t.schedule()
 asts = [s for s in schedule if s.ast.op is Ops.SINK]
-assert len(asts) == desired_count
+assert len(asts) == desired_count, f"{len(asts)} != {desired_count}"

 class TestUnaryOpsConstFolding(unittest.TestCase):
 def test_all_consts_ops(self):
@@ -98,13 +100,47 @@ class TestBinaryOpsConstFolding(unittest.TestCase):
 def test_tensor_one_pow(self):
 _check_ast_count(0, Tensor.ones(4) ** Tensor([1.0, 2, 3, 4]))

+class TestBitcastConstFolding(unittest.TestCase):
+def test_scalar_bitcast(self):
+def t(cases: dict[DType, Any]):
+for (from_dt, from_v), (to_dt, to_v) in itertools.product(cases.items(), cases.items()):
+if not math.isnan(from_v):
+r = full_graph_rewrite(UOp.const(from_dt, from_v).bitcast(to_dt).sink()).src[0]
+self.assertEqual(r.op, Ops.CONST, msg:=f"{from_dt} -> {to_dt} ({from_v} -> {to_v})")
+self.assertEqual(r.dtype, to_dt, msg)
+np.testing.assert_equal(r.arg, to_v, msg)
+
+t({dtypes.int8: 0, dtypes.uint8: 0, dtypes.bool: False})
+t({dtypes.int8: 1, dtypes.uint8: 1, dtypes.bool: True})
+
+t({dtypes.int8: -1, dtypes.uint8: 2**8-1})
+t({dtypes.int16: -1, dtypes.uint16: 2**16-1, dtypes.float16: float('nan')})
+t({dtypes.int32: -1, dtypes.uint32: 2**32-1, dtypes.float32: float('nan')})
+t({dtypes.int64: -1, dtypes.uint64: 2**64-1, dtypes.float64: float('nan')})
+
+t({dtypes.int8: -2**7, dtypes.uint8: 2**7})
+t({dtypes.int16: -2**15, dtypes.uint16: 2**15})
+t({dtypes.int32: -2**31, dtypes.uint32: 2**31})
+t({dtypes.int64: -2**63, dtypes.uint64: 2**63})
+
+t({dtypes.int16: 13496, dtypes.uint16: 13496, dtypes.float16: 0.294921875})
+t({dtypes.int32: 1050081145, dtypes.uint32: 1050081145, dtypes.float32: 0.29485681653022766})
+t({dtypes.int64: 4598983288165178391, dtypes.uint64: 4598983288165178391, dtypes.float64: 0.29485681936461233})
+
+def test_vec_bitcast(self):
+r = full_graph_rewrite(UOp.const(dtypes.int32.vec(3), (-1, -2**31, 75)).bitcast(dtypes.uint32.vec(3)).sink()).src[0]
+self.assertEqual(r.op, Ops.VECTORIZE)
+self.assertEqual(r.dtype, dtypes.uint32.vec(3))
+self.assertEqual(tuple(x.arg for x in r.src), (2**32-1, 2**31, 75))
+
 # folds advance indexing into basic indexing
 class TestIndexingConstFolding(unittest.TestCase):
 def test_scalar_index(self):
 t = Tensor.arange(16).float().reshape(1,1,4,4).realize()
-
-_check_ast_count(
-_check_ast_count(
+# TODO: fold these
+_check_ast_count(2, t[:,:,Tensor(1),:])
+_check_ast_count(2, t[:,:,Tensor(1)+2,:])
+_check_ast_count(2, t[:,:,Tensor(1),Tensor(0)])

 @unittest.expectedFailure
 def test_const_tensor_index(self):
@@ -130,11 +166,12 @@ class TestMovedConstFolding(unittest.TestCase):

 def test_cast_padded(self):
 # NOTE: this is folded due to CAST_BEFORE_VIEW
+# update: CAST_BEFORE_VIEW=1 is no longer supported
 if is_dtype_supported(dtypes.int16):
-_check_ast_count(
+_check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
 np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
 if is_dtype_supported(dtypes.uint16):
-_check_ast_count(
+_check_ast_count(1, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
 np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
 # not folded
 if is_dtype_supported(dtypes.int64):
@@ -158,6 +195,37 @@ class TestReduceOpsConstFolding(unittest.TestCase):
 _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).exp().sum())
 np.testing.assert_allclose(Tensor.ones(4).pad(((1, 1),)).exp().sum().numpy(), 4 * math.e + 2)

+def test_bool_zero_max(self):
+_check_ast_count(0, Tensor.full((1, 2), True).shrink(((0, 1), (0, 0))).max((1, 0)))
+np.testing.assert_equal(Tensor.full((1, 2), True).shrink(((0, 1), (0, 0))).max((1, 0)).numpy(), False)
+
+def test_zero_size_ops(self):
+for reduceop in [lambda x:x.prod(), lambda x:x.sum()]: # lambda x:x.max() NOTE: numpy gives "reduction operation maximum which has no identity"
+_check_ast_count(0, reduceop(Tensor.empty(1, 0)))
+np.testing.assert_equal(reduceop(Tensor.empty(shape:=(1, 0))).numpy(), reduceop(np.empty(shape)))
+
+def test_zero_size_ops_view(self):
+for reduceop in [lambda x:x.prod(), lambda x:x.sum()]:
+_check_ast_count(0, reduceop(Tensor.empty(1, 0, 4).permute((1, 2, 0)).contiguous()))
+np.testing.assert_equal(reduceop(Tensor.empty(shape:=(1, 0))).numpy(), reduceop(np.empty((shape))))
+
+def test_zero_size_ops_realized(self):
+for reduceop in [lambda x:x.prod(), lambda x:x.sum()]:
+_check_ast_count(0, reduceop((Tensor.randn(0, 1)+1).realize()))
+np.testing.assert_equal(reduceop((Tensor.randn(shape:=(0, 1))+1).realize()).numpy(), reduceop(np.empty(shape)))
+
+def test_zero_size_realize_folded(self):
+# non contiguous folded output doesn't realize
+_check_ast_count(0, Tensor.empty(1, 0).sum())
+# contiguous folded const can still schedule
+a = Tensor.empty(1, 0).sum().contiguous()
+_check_ast_count(2, a+2)
+self.assertIsNotNone(a.lazydata.base.realized)
+np.testing.assert_equal((Tensor.empty(1, 0).sum().contiguous()+2).numpy(), 2)
+# otherwise we just fuse it
+_check_ast_count(1, (Tensor.empty(1, 0).sum()+2).contiguous())
+np.testing.assert_equal((Tensor.empty(1, 0).sum()+2).numpy(), 2)
+
 def test_const_prod(self):
 _check_ast_count(0, Tensor.full((2, 3), fill_value=2).prod())
 np.testing.assert_equal(Tensor.full((2, 3), fill_value=2).prod().numpy(), 2**(2*3))
@@ -206,6 +274,8 @@ class TestMultiConstFolding(unittest.TestCase):
 _check_ast_count(0, t ** 1)
 _check_ast_count(0, 1 ** t)

+# failing because multi calls .contiguous() on every single sharded uop
+@unittest.expectedFailure
 def test_multi_const_folding_tensor(self):
 ds = tuple(f"{Device.DEFAULT}:{i}" for i in range(4))
 t = Tensor.arange(16).float().realize().to(ds)
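The constants in the new `TestBitcastConstFolding` table are one bit pattern read as several dtypes. A numpy-only sanity check of a few of those pairings (independent of tinygrad, shown only for orientation):

```python
import numpy as np

# the same 16/32-bit patterns reinterpreted, matching the (int, uint, float) triples above
print(np.array(13496, dtype=np.int16).view(np.float16))       # 0.2949 (0.294921875)
print(np.array(1050081145, dtype=np.int32).view(np.float32))  # 0.29485682
print(np.array(-1, dtype=np.int32).view(np.uint32))           # 4294967295 == 2**32 - 1
print(np.array(-2**31, dtype=np.int32).view(np.uint32))       # 2147483648 == 2**31
```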
--- tinygrad-0.10.0/test/test_conv_shapetracker.py
+++ tinygrad-0.10.2/test/test_conv_shapetracker.py
@@ -3,7 +3,6 @@ import unittest
 from tinygrad.ops import Ops
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d
-from tinygrad.engine.schedule import create_schedule
 from tinygrad.shape.shapetracker import ShapeTracker, View
 from tinygrad.helpers import prod
 from test.unit.test_shapetracker import shapetracker_getitem
@@ -11,13 +10,12 @@ from test.unit.test_shapetracker import shapetracker_getitem
 class TestConvShapetracker(unittest.TestCase):
 def test_conv_3x3_one_view(self):
 conv = Conv2d(16, 32, (3, 3))
-
 # first run to init the weights, they are scheduled.
-
+conv(Tensor.empty(1, 16, 10, 10)).schedule()
 # run it again to get the kernels
-sched = [si for si in
+sched = [si for si in conv(Tensor.empty(1, 16, 10, 10)).schedule() if si.ast.op is Ops.SINK]
 assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
-for st in [x.st_arg for x in sched[0].ast.
+for st in [x.st_arg for x in sched[0].ast.toposort if x.op is Ops.LOAD]:
 assert len(st.views) == 1

 def test_conv_2x2_backward_one_view(self):
@@ -26,11 +24,10 @@ class TestConvShapetracker(unittest.TestCase):
 conv(X).mean().backward()
 si = X.grad.schedule()[-1]
 print(si)
-ldb = [x for x in si.ast.
+ldb = [x for x in si.ast.toposort if x.op is Ops.LOAD][0]
 st: ShapeTracker = ldb.st_arg.simplify()
-# NOTE: st.real_size() is broken
 print(si.inputs[0].size)
-
+self.assertEqual(si.inputs[0].size, st.real_size())
 for v in st.views: print(v)

 # same st