tinygrad 0.10.2__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. {tinygrad-0.10.2/tinygrad.egg-info → tinygrad-0.11.0}/PKG-INFO +24 -16
  2. {tinygrad-0.10.2 → tinygrad-0.11.0}/README.md +5 -6
  3. {tinygrad-0.10.2 → tinygrad-0.11.0}/setup.py +37 -14
  4. tinygrad-0.11.0/test/test_amd_llvm.py +52 -0
  5. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_arange.py +78 -39
  6. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_assign.py +11 -11
  7. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_const_folding.py +27 -36
  8. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_copy_speed.py +29 -4
  9. tinygrad-0.11.0/test/test_define_reg.py +32 -0
  10. tinygrad-0.11.0/test/test_disassembly.py +21 -0
  11. tinygrad-0.11.0/test/test_dtype.py +426 -0
  12. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_dtype_alu.py +19 -16
  13. tinygrad-0.11.0/test/test_edgecases.py +276 -0
  14. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_gc.py +33 -3
  15. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_graph.py +38 -9
  16. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_hcq.py +120 -40
  17. tinygrad-0.11.0/test/test_hcq_iface.py +105 -0
  18. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_image_dtype.py +31 -24
  19. tinygrad-0.11.0/test/test_interop.py +52 -0
  20. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_jit.py +270 -9
  21. tinygrad-0.11.0/test/test_jit_cases.py +78 -0
  22. tinygrad-0.11.0/test/test_linalg.py +76 -0
  23. tinygrad-0.11.0/test/test_linearizer.py +1423 -0
  24. tinygrad-0.11.0/test/test_linearizer_dumb.py +201 -0
  25. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_linearizer_overflows.py +37 -68
  26. tinygrad-0.11.0/test/test_memory_planner.py +124 -0
  27. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_multitensor.py +274 -124
  28. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_nn.py +75 -171
  29. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_ops.py +568 -249
  30. tinygrad-0.11.0/test/test_opt_gemm.py +43 -0
  31. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_optim.py +27 -1
  32. tinygrad-0.11.0/test/test_outerworld_range.py +148 -0
  33. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_pickle.py +12 -8
  34. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_profiler.py +65 -15
  35. tinygrad-0.11.0/test/test_quantize_onnx.py +364 -0
  36. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_randomness.py +50 -16
  37. tinygrad-0.11.0/test/test_remote.py +99 -0
  38. tinygrad-0.11.0/test/test_renderer_failures.py +121 -0
  39. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_sample.py +3 -1
  40. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_schedule.py +520 -520
  41. tinygrad-0.11.0/test/test_search.py +146 -0
  42. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_setitem.py +39 -13
  43. tinygrad-0.11.0/test/test_softmax_fusion.py +202 -0
  44. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_speed_v_torch.py +7 -4
  45. tinygrad-0.11.0/test/test_stunning.py +59 -0
  46. tinygrad-0.11.0/test/test_subbuffer.py +183 -0
  47. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_symbolic_jit.py +35 -1
  48. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_symbolic_ops.py +97 -6
  49. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor.py +105 -40
  50. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_uop.py +5 -5
  51. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_variable.py +30 -24
  52. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tiny.py +28 -9
  53. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_transcendental.py +36 -5
  54. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uop_graph.py +163 -140
  55. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uops.py +114 -119
  56. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uops_stats.py +30 -34
  57. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_winograd.py +18 -29
  58. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_zero_copy.py +1 -1
  59. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/__init__.py +1 -1
  60. tinygrad-0.11.0/tinygrad/apps/llm.py +206 -0
  61. tinygrad-0.11.0/tinygrad/codegen/__init__.py +116 -0
  62. tinygrad-0.11.0/tinygrad/codegen/devectorizer.py +390 -0
  63. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/codegen/expander.py +8 -16
  64. tinygrad-0.11.0/tinygrad/codegen/gpudims.py +89 -0
  65. tinygrad-0.11.0/tinygrad/codegen/linearize.py +236 -0
  66. tinygrad-0.11.0/tinygrad/codegen/lowerer.py +114 -0
  67. tinygrad-0.11.0/tinygrad/codegen/opt/__init__.py +38 -0
  68. tinygrad-0.11.0/tinygrad/codegen/opt/heuristic.py +125 -0
  69. tinygrad-0.11.0/tinygrad/codegen/opt/kernel.py +510 -0
  70. {tinygrad-0.10.2/tinygrad/engine → tinygrad-0.11.0/tinygrad/codegen/opt}/search.py +51 -35
  71. tinygrad-0.11.0/tinygrad/codegen/opt/swizzler.py +134 -0
  72. tinygrad-0.11.0/tinygrad/codegen/opt/tc.py +127 -0
  73. tinygrad-0.11.0/tinygrad/codegen/quantize.py +67 -0
  74. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/device.py +122 -132
  75. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/dtype.py +152 -35
  76. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/engine/jit.py +81 -54
  77. tinygrad-0.11.0/tinygrad/engine/memory.py +69 -0
  78. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/engine/realize.py +82 -41
  79. tinygrad-0.11.0/tinygrad/engine/schedule.py +83 -0
  80. tinygrad-0.11.0/tinygrad/frontend/onnx.py +1253 -0
  81. tinygrad-0.11.0/tinygrad/frontend/torch.py +5 -0
  82. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/gradient.py +19 -27
  83. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/helpers.py +95 -47
  84. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/__init__.py +7 -8
  85. tinygrad-0.11.0/tinygrad/nn/optim.py +177 -0
  86. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/state.py +37 -23
  87. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/__init__.py +40 -60
  88. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/cstyle.py +143 -128
  89. tinygrad-0.11.0/tinygrad/renderer/llvmir.py +242 -0
  90. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/ptx.py +50 -32
  91. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/wgsl.py +27 -23
  92. tinygrad-0.11.0/tinygrad/runtime/autogen/am/am.py +5861 -0
  93. tinygrad-0.11.0/tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
  94. tinygrad-0.11.0/tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
  95. tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
  96. tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
  97. tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
  98. tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
  99. tinygrad-0.11.0/tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
  100. tinygrad-0.11.0/tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
  101. tinygrad-0.11.0/tinygrad/runtime/autogen/amd_gpu.py +22115 -0
  102. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/comgr.py +35 -9
  103. tinygrad-0.11.0/tinygrad/runtime/autogen/comgr_3.py +906 -0
  104. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/cuda.py +2419 -494
  105. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/hsa.py +57 -16
  106. tinygrad-0.11.0/tinygrad/runtime/autogen/ib.py +7171 -0
  107. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/io_uring.py +917 -118
  108. tinygrad-0.11.0/tinygrad/runtime/autogen/kfd.py +1548 -0
  109. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/libc.py +613 -218
  110. tinygrad-0.11.0/tinygrad/runtime/autogen/libusb.py +1643 -0
  111. tinygrad-0.11.0/tinygrad/runtime/autogen/nv/nv.py +8602 -0
  112. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
  113. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/opencl.py +2 -4
  114. tinygrad-0.11.0/tinygrad/runtime/autogen/sqtt.py +1789 -0
  115. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/vfio.py +3 -3
  116. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/webgpu.py +273 -264
  117. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/cuda.py +3 -3
  118. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/hcq.py +68 -29
  119. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/metal.py +29 -13
  120. tinygrad-0.11.0/tinygrad/runtime/graph/remote.py +114 -0
  121. tinygrad-0.11.0/tinygrad/runtime/ops_amd.py +852 -0
  122. tinygrad-0.11.0/tinygrad/runtime/ops_cpu.py +125 -0
  123. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_cuda.py +12 -14
  124. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_disk.py +13 -10
  125. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_dsp.py +47 -40
  126. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_gpu.py +13 -11
  127. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_hip.py +6 -9
  128. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_llvm.py +35 -15
  129. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_metal.py +29 -19
  130. tinygrad-0.11.0/tinygrad/runtime/ops_npy.py +11 -0
  131. tinygrad-0.11.0/tinygrad/runtime/ops_null.py +28 -0
  132. tinygrad-0.11.0/tinygrad/runtime/ops_nv.py +621 -0
  133. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_python.py +62 -52
  134. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_qcom.py +28 -39
  135. tinygrad-0.11.0/tinygrad/runtime/ops_remote.py +482 -0
  136. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_webgpu.py +28 -28
  137. tinygrad-0.11.0/tinygrad/runtime/support/am/amdev.py +261 -0
  138. tinygrad-0.11.0/tinygrad/runtime/support/am/ip.py +502 -0
  139. tinygrad-0.11.0/tinygrad/runtime/support/amd.py +138 -0
  140. tinygrad-0.10.2/tinygrad/runtime/support/compiler_hip.py → tinygrad-0.11.0/tinygrad/runtime/support/compiler_amd.py +40 -8
  141. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/compiler_cuda.py +8 -11
  142. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/elf.py +2 -1
  143. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/hcq.py +184 -97
  144. tinygrad-0.11.0/tinygrad/runtime/support/ib.py +172 -0
  145. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/llvm.py +3 -4
  146. tinygrad-0.11.0/tinygrad/runtime/support/memory.py +251 -0
  147. tinygrad-0.11.0/tinygrad/runtime/support/nv/ip.py +581 -0
  148. tinygrad-0.11.0/tinygrad/runtime/support/nv/nvdev.py +183 -0
  149. tinygrad-0.11.0/tinygrad/runtime/support/system.py +170 -0
  150. tinygrad-0.11.0/tinygrad/runtime/support/usb.py +268 -0
  151. tinygrad-0.11.0/tinygrad/runtime/support/webgpu.py +18 -0
  152. tinygrad-0.11.0/tinygrad/schedule/__init__.py +0 -0
  153. tinygrad-0.11.0/tinygrad/schedule/grouper.py +119 -0
  154. tinygrad-0.11.0/tinygrad/schedule/kernelize.py +368 -0
  155. tinygrad-0.11.0/tinygrad/schedule/multi.py +231 -0
  156. tinygrad-0.11.0/tinygrad/shape/__init__.py +0 -0
  157. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/shape/shapetracker.py +40 -46
  158. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/shape/view.py +88 -52
  159. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/tensor.py +968 -542
  160. tinygrad-0.11.0/tinygrad/uop/__init__.py +117 -0
  161. tinygrad-0.10.2/tinygrad/codegen/transcendental.py → tinygrad-0.11.0/tinygrad/uop/decompositions.py +125 -38
  162. tinygrad-0.11.0/tinygrad/uop/mathtraits.py +169 -0
  163. tinygrad-0.11.0/tinygrad/uop/ops.py +1021 -0
  164. tinygrad-0.11.0/tinygrad/uop/spec.py +228 -0
  165. {tinygrad-0.10.2/tinygrad/codegen → tinygrad-0.11.0/tinygrad/uop}/symbolic.py +239 -216
  166. tinygrad-0.11.0/tinygrad/uop/upat.py +163 -0
  167. tinygrad-0.11.0/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
  168. tinygrad-0.11.0/tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
  169. tinygrad-0.11.0/tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
  170. tinygrad-0.11.0/tinygrad/viz/index.html +344 -0
  171. tinygrad-0.11.0/tinygrad/viz/js/index.js +718 -0
  172. tinygrad-0.11.0/tinygrad/viz/js/worker.js +29 -0
  173. tinygrad-0.11.0/tinygrad/viz/serve.py +327 -0
  174. {tinygrad-0.10.2 → tinygrad-0.11.0/tinygrad.egg-info}/PKG-INFO +24 -16
  175. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/SOURCES.txt +70 -21
  176. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/requires.txt +16 -9
  177. tinygrad-0.10.2/test/test_conv.py +0 -150
  178. tinygrad-0.10.2/test/test_conv_shapetracker.py +0 -55
  179. tinygrad-0.10.2/test/test_dtype.py +0 -893
  180. tinygrad-0.10.2/test/test_fuzz_shape_ops.py +0 -88
  181. tinygrad-0.10.2/test/test_linearizer.py +0 -2203
  182. tinygrad-0.10.2/test/test_linearizer_dumb.py +0 -225
  183. tinygrad-0.10.2/test/test_linearizer_failures.py +0 -1415
  184. tinygrad-0.10.2/test/test_masked_st.py +0 -32
  185. tinygrad-0.10.2/test/test_quantize_onnx.py +0 -212
  186. tinygrad-0.10.2/test/test_rearrange_einops.py +0 -321
  187. tinygrad-0.10.2/test/test_renderer_failures.py +0 -76
  188. tinygrad-0.10.2/test/test_search.py +0 -190
  189. tinygrad-0.10.2/test/test_subbuffer.py +0 -68
  190. tinygrad-0.10.2/test/test_symbolic_shapetracker.py +0 -244
  191. tinygrad-0.10.2/tinygrad/codegen/devectorizer.py +0 -247
  192. tinygrad-0.10.2/tinygrad/codegen/kernel.py +0 -693
  193. tinygrad-0.10.2/tinygrad/codegen/linearize.py +0 -234
  194. tinygrad-0.10.2/tinygrad/codegen/lowerer.py +0 -161
  195. tinygrad-0.10.2/tinygrad/engine/memory.py +0 -50
  196. tinygrad-0.10.2/tinygrad/engine/multi.py +0 -161
  197. tinygrad-0.10.2/tinygrad/engine/schedule.py +0 -458
  198. tinygrad-0.10.2/tinygrad/nn/optim.py +0 -146
  199. tinygrad-0.10.2/tinygrad/ops.py +0 -1003
  200. tinygrad-0.10.2/tinygrad/renderer/llvmir.py +0 -191
  201. tinygrad-0.10.2/tinygrad/runtime/autogen/amd_gpu.py +0 -87879
  202. tinygrad-0.10.2/tinygrad/runtime/autogen/kfd.py +0 -826
  203. tinygrad-0.10.2/tinygrad/runtime/ops_amd.py +0 -635
  204. tinygrad-0.10.2/tinygrad/runtime/ops_cloud.py +0 -220
  205. tinygrad-0.10.2/tinygrad/runtime/ops_cpu.py +0 -24
  206. tinygrad-0.10.2/tinygrad/runtime/ops_npy.py +0 -9
  207. tinygrad-0.10.2/tinygrad/runtime/ops_nv.py +0 -549
  208. tinygrad-0.10.2/tinygrad/runtime/support/allocator.py +0 -94
  209. tinygrad-0.10.2/tinygrad/runtime/support/am/amdev.py +0 -396
  210. tinygrad-0.10.2/tinygrad/runtime/support/am/ip.py +0 -463
  211. tinygrad-0.10.2/tinygrad/spec.py +0 -155
  212. tinygrad-0.10.2/tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
  213. tinygrad-0.10.2/tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
  214. tinygrad-0.10.2/tinygrad/viz/index.html +0 -544
  215. tinygrad-0.10.2/tinygrad/viz/perfetto.html +0 -178
  216. tinygrad-0.10.2/tinygrad/viz/serve.py +0 -205
  217. {tinygrad-0.10.2 → tinygrad-0.11.0}/LICENSE +0 -0
  218. {tinygrad-0.10.2 → tinygrad-0.11.0}/setup.cfg +0 -0
  219. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_compile_failures.py +0 -0
  220. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_device_speed.py +0 -0
  221. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_fusion_op.py +0 -0
  222. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_kernel_cache.py +0 -0
  223. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_metal.py +0 -0
  224. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_method_cache.py +0 -0
  225. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_net_speed.py +0 -0
  226. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_ocl.py +0 -0
  227. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_specific_conv.py +0 -0
  228. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_data.py +0 -0
  229. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_to_numpy.py +0 -0
  230. {tinygrad-0.10.2/tinygrad/codegen → tinygrad-0.11.0/tinygrad/engine}/__init__.py +0 -0
  231. {tinygrad-0.10.2/tinygrad/engine → tinygrad-0.11.0/tinygrad/frontend}/__init__.py +0 -0
  232. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/datasets.py +0 -0
  233. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/py.typed +0 -0
  234. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/__init__.py +0 -0
  235. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/adreno.py +0 -0
  236. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/hip.py +0 -0
  237. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/kgsl.py +0 -0
  238. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/llvm.py +0 -0
  239. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/nvrtc.py +0 -0
  240. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/pci.py +0 -0
  241. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/qcom_dsp.py +0 -0
  242. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/__init__.py +0 -0
  243. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/__init__.py +0 -0
  244. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/am/__init__.py +0 -0
  245. {tinygrad-0.10.2/tinygrad/shape → tinygrad-0.11.0/tinygrad/runtime/support/nv}/__init__.py +0 -0
  246. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +0 -0
  247. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +0 -0
  248. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +0 -0
  249. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +0 -0
  250. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +0 -0
  251. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/dependency_links.txt +0 -0
  252. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: tinygrad
- Version: 0.10.2
+ Version: 0.11.0
  Summary: You like pytorch? You like micrograd? You love tinygrad! <3
  Author: George Hotz
  License: MIT
@@ -19,31 +19,38 @@ Requires-Dist: mypy==1.13.0; extra == "linting"
  Requires-Dist: typing-extensions; extra == "linting"
  Requires-Dist: pre-commit; extra == "linting"
  Requires-Dist: ruff; extra == "linting"
- Requires-Dist: types-tqdm; extra == "linting"
+ Requires-Dist: numpy; extra == "linting"
  Provides-Extra: testing-minimal
  Requires-Dist: numpy; extra == "testing-minimal"
- Requires-Dist: torch; extra == "testing-minimal"
+ Requires-Dist: torch==2.7.1; extra == "testing-minimal"
  Requires-Dist: pytest; extra == "testing-minimal"
  Requires-Dist: pytest-xdist; extra == "testing-minimal"
  Requires-Dist: hypothesis; extra == "testing-minimal"
+ Requires-Dist: z3-solver; extra == "testing-minimal"
+ Requires-Dist: ml_dtypes; extra == "testing-minimal"
  Provides-Extra: testing-unit
  Requires-Dist: numpy; extra == "testing-unit"
- Requires-Dist: torch; extra == "testing-unit"
+ Requires-Dist: torch==2.7.1; extra == "testing-unit"
  Requires-Dist: pytest; extra == "testing-unit"
  Requires-Dist: pytest-xdist; extra == "testing-unit"
  Requires-Dist: hypothesis; extra == "testing-unit"
+ Requires-Dist: z3-solver; extra == "testing-unit"
+ Requires-Dist: ml_dtypes; extra == "testing-unit"
  Requires-Dist: tqdm; extra == "testing-unit"
  Requires-Dist: safetensors; extra == "testing-unit"
  Requires-Dist: tabulate; extra == "testing-unit"
  Provides-Extra: testing
  Requires-Dist: numpy; extra == "testing"
- Requires-Dist: torch; extra == "testing"
+ Requires-Dist: torch==2.7.1; extra == "testing"
  Requires-Dist: pytest; extra == "testing"
  Requires-Dist: pytest-xdist; extra == "testing"
  Requires-Dist: hypothesis; extra == "testing"
+ Requires-Dist: z3-solver; extra == "testing"
+ Requires-Dist: ml_dtypes; extra == "testing"
  Requires-Dist: pillow; extra == "testing"
- Requires-Dist: onnx==1.16.0; extra == "testing"
+ Requires-Dist: onnx==1.18.0; extra == "testing"
  Requires-Dist: onnx2torch; extra == "testing"
+ Requires-Dist: onnxruntime; extra == "testing"
  Requires-Dist: opencv-python; extra == "testing"
  Requires-Dist: tabulate; extra == "testing"
  Requires-Dist: tqdm; extra == "testing"
@@ -58,6 +65,10 @@ Requires-Dist: nibabel; extra == "testing"
  Requires-Dist: bottle; extra == "testing"
  Requires-Dist: ggml-python; extra == "testing"
  Requires-Dist: capstone; extra == "testing"
+ Requires-Dist: pycocotools; extra == "testing"
+ Requires-Dist: boto3; extra == "testing"
+ Requires-Dist: pandas; extra == "testing"
+ Requires-Dist: influxdb3-python; extra == "testing"
  Provides-Extra: docs
  Requires-Dist: mkdocs; extra == "docs"
  Requires-Dist: mkdocs-material; extra == "docs"
@@ -66,14 +77,12 @@ Requires-Dist: markdown-callouts; extra == "docs"
  Requires-Dist: markdown-exec[ansi]; extra == "docs"
  Requires-Dist: black; extra == "docs"
  Requires-Dist: numpy; extra == "docs"
- Provides-Extra: testing-tf
- Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
- Requires-Dist: tensorflow_addons; extra == "testing-tf"
  Dynamic: author
  Dynamic: classifier
  Dynamic: description
  Dynamic: description-content-type
  Dynamic: license
+ Dynamic: license-file
  Dynamic: provides-extra
  Dynamic: requires-python
  Dynamic: summary
@@ -101,11 +110,11 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

  ---

- This may not be the best deep learning framework, but it is a deep learning framework.
+ Despite tinygrad's size, it is a fully featured deep learning framework.

- Due to its extreme simplicity, it aims to be the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.
+ Due to its extreme simplicity, it is the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.

- tinygrad is still alpha software, but we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.
+ tinygrad is now beta software, we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.

  ## Features

@@ -119,9 +128,8 @@ Try a matmul. See how, despite the style, it is fused into one kernel with the p

  ```sh
  DEBUG=3 python3 -c "from tinygrad import Tensor;
- N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N);
- c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2);
- print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
+ N = 1024; a, b = Tensor.empty(N, N), Tensor.empty(N, N);
+ (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()"
  ```

  And we can change `DEBUG` to `4` to see the generated code.
@@ -21,11 +21,11 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

  ---

- This may not be the best deep learning framework, but it is a deep learning framework.
+ Despite tinygrad's size, it is a fully featured deep learning framework.

- Due to its extreme simplicity, it aims to be the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.
+ Due to its extreme simplicity, it is the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.

- tinygrad is still alpha software, but we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.
+ tinygrad is now beta software, we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.

  ## Features

@@ -39,9 +39,8 @@ Try a matmul. See how, despite the style, it is fused into one kernel with the p

  ```sh
  DEBUG=3 python3 -c "from tinygrad import Tensor;
- N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N);
- c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2);
- print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
+ N = 1024; a, b = Tensor.empty(N, N), Tensor.empty(N, N);
+ (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()"
  ```

  And we can change `DEBUG` to `4` to see the generated code.
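For reference, here is the updated README snippet written out as a plain Python script rather than a shell one-liner. This is a sketch, not part of the diff; the only assumption beyond the lines above is that the `DEBUG` flag can be supplied through the environment before tinygrad is imported, matching how the README one-liner sets it.

```python
# Standalone version of the new README matmul example; set DEBUG to "4" to also see generated code.
import os
os.environ["DEBUG"] = "3"  # assumed: DEBUG is read from the environment when tinygrad is imported

from tinygrad import Tensor

N = 1024
a, b = Tensor.empty(N, N), Tensor.empty(N, N)
# the broadcasted multiply and the sum over axis 2 fuse into a single kernel; realize() triggers it
(a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()
```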
@@ -9,22 +9,44 @@ with open(directory / 'README.md', encoding='utf-8') as f:

  testing_minimal = [
  "numpy",
- "torch",
+ "torch==2.7.1",
  "pytest",
  "pytest-xdist",
  "hypothesis",
+ "z3-solver",
+ "ml_dtypes"
  ]

  setup(name='tinygrad',
- version='0.10.2',
+ version='0.11.0',
  description='You like pytorch? You like micrograd? You love tinygrad! <3',
  author='George Hotz',
  license='MIT',
  long_description=long_description,
  long_description_content_type='text/markdown',
- packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine', 'tinygrad.viz',
- 'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.support.am', 'tinygrad.runtime.graph', 'tinygrad.shape'],
- package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'perfetto.html', 'assets/**/*']},
+ packages = [
+ 'tinygrad',
+ 'tinygrad.apps',
+ 'tinygrad.codegen',
+ 'tinygrad.codegen.opt',
+ 'tinygrad.engine',
+ 'tinygrad.frontend',
+ 'tinygrad.nn',
+ 'tinygrad.renderer',
+ 'tinygrad.runtime',
+ 'tinygrad.runtime.autogen',
+ 'tinygrad.runtime.autogen.am',
+ 'tinygrad.runtime.autogen.nv',
+ 'tinygrad.runtime.graph',
+ 'tinygrad.runtime.support',
+ 'tinygrad.runtime.support.am',
+ 'tinygrad.runtime.support.nv',
+ 'tinygrad.schedule',
+ 'tinygrad.shape',
+ 'tinygrad.uop',
+ 'tinygrad.viz',
+ ],
+ package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'assets/**/*', 'js/*']},
  classifiers=[
  "Programming Language :: Python :: 3",
  "License :: OSI Approved :: MIT License"
@@ -40,19 +62,20 @@ setup(name='tinygrad',
  "typing-extensions",
  "pre-commit",
  "ruff",
- "types-tqdm",
+ "numpy",
  ],
- #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.1.0-rc3"],
+ #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@5.0.0-rc3"],
  'testing_minimal': testing_minimal,
  'testing_unit': testing_minimal + [
  "tqdm",
  "safetensors",
- "tabulate" # for sz.py
+ "tabulate", # for sz.py
  ],
  'testing': testing_minimal + [
  "pillow",
- "onnx==1.16.0",
+ "onnx==1.18.0",
  "onnx2torch",
+ "onnxruntime",
  "opencv-python",
  "tabulate",
  "tqdm",
@@ -66,7 +89,11 @@ setup(name='tinygrad',
  "nibabel",
  "bottle",
  "ggml-python",
- "capstone"
+ "capstone",
+ "pycocotools",
+ "boto3",
+ "pandas",
+ "influxdb3-python"
  ],
  'docs': [
  "mkdocs",
@@ -77,9 +104,5 @@ setup(name='tinygrad',
  "black",
  "numpy",
  ],
- 'testing_tf': [
- "tensorflow==2.15.1",
- "tensorflow_addons",
- ],
  },
  include_package_data=True)
@@ -0,0 +1,52 @@
+ import unittest
+ import numpy as np
+ from tinygrad import Device
+ from tinygrad.device import CompileError
+ from tinygrad.helpers import flat_mv
+ if Device.DEFAULT=="AMD":
+ from tinygrad.runtime.ops_amd import AMDAllocator, AMDDevice, AMDProgram
+ from tinygrad.runtime.support.compiler_amd import AMDLLVMCompiler
+
+ @unittest.skipUnless(Device.DEFAULT == "AMD", "Runs only on AMD")
+ class TestAMDLLVM(unittest.TestCase):
+ def test_compiler(self):
+ src = '''
+ ; https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/imm.ll
+ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
+ entry:
+ store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
+ ret void
+ }
+ '''
+ device = AMDDevice()
+ compiler = AMDLLVMCompiler("gfx1100")
+ obj = compiler.compile(src)
+ allocator = AMDAllocator(device)
+ a = allocator.alloc(1*8)
+ prog = AMDProgram(device, "test", obj)
+ prog(a, wait=True)
+ na = np.empty(1, np.uint64)
+ allocator._copyout(flat_mv(na.data), a)
+ assert na == [0x1234567800000005]
+
+ def test_compiler_diag_error(self):
+ src = """
+ @local_temp0 = internal unnamed_addr addrspace(3) global [{N} x float*] undef, align 16
+ define amdgpu_kernel void @test(float* noalias align 32 %data0, half* noalias align 32 %data1, float* noalias align 32 %data2) #0
+ {{
+ %local_temp0 = addrspacecast [{N} x float*] addrspace(3)* @local_temp0 to [{N} x float*]*
+ %v178 = getelementptr inbounds float, float* %local_temp0, i32 1
+ %v133 = getelementptr inbounds float, float* %data2, i32 1
+ %v134 = load float, float* %v133
+ store float %v134, float* %v178
+ ret void
+ }}
+ """
+ compiler = AMDLLVMCompiler("gfx1100")
+ compiler.compile(src.format(N=65536//8))
+ with self.assertRaises(CompileError):
+ # llvm diagnostic: <unknown>:0:0: local memory (65544) exceeds limit (65536) in function 'test'
+ compiler.compile(src.format(N=65536//8+1))
+
+ if __name__ == '__main__':
+ unittest.main()
@@ -1,12 +1,13 @@
  import unittest, contextlib
  import numpy as np
- from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device
+ from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
  from tinygrad.helpers import CI, Context, getenv
  from tinygrad.engine.realize import run_schedule
- from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
- from tinygrad.engine.realize import CompiledRunner, ExecItem
- from tinygrad.engine.search import get_kernel_actions
- from tinygrad.ops import Ops
+ from tinygrad.codegen.opt.kernel import Opt, OptOps, Kernel, KernelOptError
+ from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
+ from tinygrad.codegen.opt.search import get_kernel_actions
+ from tinygrad.uop.ops import Ops
+ from tinygrad.codegen import apply_rewrites, rewrites_for_views

  class TestArange(unittest.TestCase):
  def _get_flops(self, N, opts=None):
@@ -14,41 +15,46 @@ class TestArange(unittest.TestCase):
  tt = Tensor.arange(N)
  sched = tt.schedule()
  self.assertEqual(len(sched), 1)
- k = Kernel(sched[-1].ast)
- if opts is not None:
- for o in opts: k.apply_opt(o)
- p = k.to_program()
+ p = get_program(sched[-1].ast, opts=opts)
  print(p.name)
  #print(p.src)
- ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+ ExecItem(CompiledRunner(p), [tt.uop.buffer]).run()
  np.testing.assert_equal(tt.numpy(), np.arange(N))
  return p.estimates.ops

  def test_complexity(self, opts=None, limit=None):
- # add 1 to avoid divide by 0. arange is 0 flops now!
- f1 = self._get_flops(256, opts) + 1
- f2 = self._get_flops(2560, opts) + 1
+ f1 = self._get_flops(256, opts)
+ f2 = self._get_flops(2560, opts)
  print(f"{f1=}, {f2=}")
- assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+ # add 1 to avoid divide by 0. arange is 0 flops now!
+ assert (f1 < 6000 and f2 < 6000) or ((f2+1) / (f1+1) < 16), f"bad complexity, flops {(f2+1) / (f1+1):.1f}X while inputs 10X"
  if limit is not None and not getenv("PTX"):
  # PTX counts index ALU in flops
  assert f1 <= limit, f"{f1=}, {limit=}"

- def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=1)
- def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=1)
- def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=1)
- def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=1)
- def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=1)
+ def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=0)
+ def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=0)
+ def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=0)
+ def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=0)
+ def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=0)

- @unittest.skip("doesn't work yet")
- def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, arg=32)])
+ if Device.default.renderer.has_local:
+ # TODO: fix limit
+ def test_complexity_w_group(self): return self.test_complexity([Opt(OptOps.GROUP, 0, 16)], limit=81920)
+ def test_complexity_w_group_top(self): return self.test_complexity([Opt(OptOps.GROUPTOP, 0, 16)], limit=106496)
+
+ def test_complexity_w_local(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16)], limit=0)
+ @unittest.skip("doesn't work yet. TODO: this absolutely should work")
+ def test_complexity_w_local_unroll4(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)], limit=0)
+ @unittest.skip("doesn't work yet")
+ def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.PADTO, axis=1, arg=32)])

  def test_all_opts(self, opts=None, exclude=None):
- k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+ k = Kernel(apply_rewrites(Tensor.arange(256).schedule()[-1].ast, rewrites_for_views))
  if opts is not None:
  for o in opts: k.apply_opt(o)
  all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
- k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+ k = Kernel(apply_rewrites(Tensor.arange(2560).schedule()[-1].ast, rewrites_for_views))
  if opts is not None:
  for o in opts: k.apply_opt(o)
  all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
@@ -65,6 +71,24 @@ class TestArange(unittest.TestCase):
  def test_all_opts_w_upcast_and_unroll(self):
  return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])

+ class TestRand(unittest.TestCase):
+ def test_fused_rand_less_ops(self, noopt=1):
+ GlobalCounters.reset()
+ with Context(FUSE_ARANGE=0, NOOPT=noopt):
+ out = Tensor.rand(16384)
+ out.realize()
+ unfused_ops = GlobalCounters.global_ops
+
+ GlobalCounters.reset()
+ with Context(FUSE_ARANGE=1, NOOPT=noopt):
+ out = Tensor.rand(16384)
+ out.realize()
+ print(f"fused {GlobalCounters.global_ops} unfused {unfused_ops}")
+ self.assertLessEqual(GlobalCounters.global_ops, unfused_ops*2)
+ def test_fused_rand_less_ops_opt(self): self.test_fused_rand_less_ops(0)
+
+ DSET, DDIM = 2048, 32
+
  class TestIndexing(unittest.TestCase):
  def test_arange_2_reduce(self):
  needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
@@ -80,52 +104,63 @@ class TestIndexing(unittest.TestCase):

  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
  def test_manual_index(self):
- dataset = Tensor.rand(16384, 256).realize()
+ dataset = Tensor.rand(DSET, DDIM).realize()
  idxs = Tensor([0,3,5,6]).realize()
  real_index = dataset.numpy()[idxs.numpy()]
  print("*** indexing ***")
  with Context(NOOPT=1, FUSE_ARANGE=1):
  GlobalCounters.reset()
- rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, 256, 16384, 1)
- idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
- reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
- full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+ rng = Tensor.ones(4, DDIM, DSET, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, DDIM, DSET, 1)
+ idxs = idxs.reshape(4,1,1,1).expand(4, DDIM, DSET, 1)
+ reshape_dataset = dataset.T.reshape(1, DDIM, DSET, 1).expand(4, DDIM, DSET, 1)
+ full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, DDIM, DSET, 1))
  X = full.sum(axis=(2,3))
  sched = X.schedule()
  self.assertEqual(len(sched), 1)
  run_schedule(sched)
- assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+ assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops}"
  np.testing.assert_allclose(real_index, X.numpy())

+ def test_index_variable(self):
+ dataset = Tensor.rand(DSET, DDIM).realize()
+ v = Variable("v", 0, DDIM-1)
+ with Context(NOOPT=1, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+ GlobalCounters.reset()
+ vb = Tensor(v.bind(12))
+ comp = dataset[vb].numpy()
+ # no global ops because they are all indexing
+ self.assertEqual(GlobalCounters.global_ops, 0)
+ np.testing.assert_allclose(comp, dataset.numpy()[12])
+
  def test_index(self):
- dataset = Tensor.rand(16384, 256).realize()
+ dataset = Tensor.rand(DSET, DDIM).realize()
  idxs = Tensor([0,3,5,6]).realize()
  real_index = dataset.numpy()[idxs.numpy()]
  print("*** indexing ***")
  with Context(NOOPT=1):
  GlobalCounters.reset()
  X = dataset[idxs]
- assert X.shape == (4,256)
+ assert X.shape == (4,DDIM)
  sched = X.schedule()
  # TODO: enable these asserts when the scheduler can handle this
  #self.assertEqual(len(sched), 1)
  run_schedule(sched)
- #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+ #assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops}"
  np.testing.assert_allclose(real_index, X.numpy())

  def test_index_fused(self, noopt=1):
- dataset = Tensor.rand(16384, 256).realize()
+ dataset = Tensor.rand(DSET, DDIM).realize()
  idxs = Tensor([0,3,5,6]).realize()
  real_index = dataset.numpy()[idxs.numpy()]
  print("*** indexing ***")
  with Context(NOOPT=noopt, FUSE_ARANGE=1):
  GlobalCounters.reset()
  X = dataset[idxs]
- assert X.shape == (4,256)
+ assert X.shape == (4,DDIM)
  sched = X.schedule()
  self.assertEqual(len(sched), 2)
  run_schedule(sched)
- assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+ assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops} != {4*DSET}"
  np.testing.assert_allclose(real_index, X.numpy())
  @unittest.skip("not ready")
  def test_index_fused_opt(self): self.test_index_fused(0)
@@ -138,10 +173,12 @@ class TestIndexing(unittest.TestCase):
  np.testing.assert_equal(X.numpy(), 0)

  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
- def test_index_mnist(self, noopt=1, op_limit=512*784*13):
+ def test_index_mnist(self, noopt=1, op_limit=512*784*13, split_reduceop=0):
+ # WEBGPU generates more ops due to bitpacking of < 4-byte dtypes
+ if Device.DEFAULT == "WEBGPU": op_limit *= 15
  from tinygrad.nn.datasets import mnist
  X_train, Y_train, _, _ = mnist()
- with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+ with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=split_reduceop):
  samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0]).realize()
  GlobalCounters.reset()
  x = X_train[samples].numpy()
@@ -149,10 +186,12 @@ class TestIndexing(unittest.TestCase):
  assert GlobalCounters.global_ops < op_limit, f"too many ops {GlobalCounters.global_ops} != {op_limit}"
  np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
  np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
- @unittest.skip("not ready")
+
  def test_index_mnist_opt(self): self.test_index_mnist(0)
+ def test_index_mnist_split(self): self.test_index_mnist(1, split_reduceop=1)
+ def test_index_mnist_opt_split(self): self.test_index_mnist(0, split_reduceop=1)

- @unittest.skipIf(getenv("PTX") or Device.DEFAULT == "WEBGPU", "broken on ptx and WebGPU for some reason")
+ @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
  def test_llama_embedding(self, noopt=1, op_limit=65536):
  # llama3 is 128256
  vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
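The test_arange additions above lean heavily on the FUSE_ARANGE flag. Below is a minimal sketch of the measurement pattern the new tests use, built only from names that already appear in the diff (Tensor, Context, GlobalCounters); it is illustrative, not part of the package.

```python
# Sketch of the FUSE_ARANGE pattern exercised by the tests above.
from tinygrad import Tensor, GlobalCounters
from tinygrad.helpers import Context

DSET, DDIM = 2048, 32
dataset = Tensor.rand(DSET, DDIM).realize()
idxs = Tensor([0, 3, 5, 6]).realize()

GlobalCounters.reset()
with Context(FUSE_ARANGE=1, NOOPT=1):
  out = dataset[idxs].realize()  # the gather fuses with the arange that generates the indices
print(GlobalCounters.global_ops)  # the fused test above asserts this stays below 4*DSET
```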
@@ -13,11 +13,11 @@ class TestAssign(unittest.TestCase):
  b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
  a.realize()
  b.realize()
- ba1 = a.lazydata.base.realized
- bb1 = b.lazydata.base.realized
+ ba1 = a.uop.base.realized
+ bb1 = b.uop.base.realized
  a += b
  a.realize()
- ba2 = a.lazydata.base.realized
+ ba2 = a.uop.base.realized
  assert ba1 == ba2 and ba1 != bb1
  np.testing.assert_allclose(a.numpy(), (np.arange(N*N)*2).reshape((N,N)))

@@ -259,13 +259,13 @@ class TestAssign(unittest.TestCase):
  b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
  a.realize()
  b.realize()
- ba1 = a.lazydata.base.realized
- bb1 = b.lazydata.base.realized
+ ba1 = a.uop.base.realized
+ bb1 = b.uop.base.realized
  with self.assertRaises((RuntimeError, AssertionError)):
  a = a.permute(1,0)
  a += b
  a.realize()
- ba2 = a.lazydata.base.realized
+ ba2 = a.uop.base.realized
  assert ba1 != ba2 and ba1 != bb1
  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

@@ -275,12 +275,12 @@ class TestAssign(unittest.TestCase):
  a.realize()
  b.realize()
  #GlobalCounters.cache = []
- ba1 = a.lazydata.base.realized # noqa: F841
- bb1 = b.lazydata.base.realized # noqa: F841
+ ba1 = a.uop.base.realized # noqa: F841
+ bb1 = b.uop.base.realized # noqa: F841
  with self.assertRaisesRegex(RuntimeError, "contiguous"):
  a.assign(a.permute(1,0) + b) # this should not work!
  a.realize()
- ba2 = a.lazydata.base.realized # noqa: F841
+ ba2 = a.uop.base.realized # noqa: F841
  # NOTE: don't test that it's assigned
  #assert ba1 == ba2 and ba1 != bb1
  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
@@ -383,10 +383,10 @@ class TestAssign(unittest.TestCase):
  def test_cast_assignment(self):
  a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
  a.realize()
- oba1 = a.lazydata.base.output_buffer
+ oba1 = a.uop.base.output_buffer
  a.assign(a.cast(dtypes.int32).realize())
  a.realize()
- oba2 = a.lazydata.base.output_buffer
+ oba2 = a.uop.base.output_buffer
  assert oba1 is None and oba2 is None
  np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))
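Most of the test_assign churn is the rename of the Tensor.lazydata attribute to Tensor.uop. A minimal sketch of what the rename looks like from user code, assuming it is exposed exactly the way these tests use it:

```python
# lazydata -> uop rename as it appears in the test_assign diff (0.10.2 -> 0.11.0).
import numpy as np
from tinygrad import Tensor

a = Tensor(np.arange(16, dtype=np.float32)).reshape(4, 4)
a.realize()
buf = a.uop.base.realized  # 0.11.0 spelling; 0.10.2 code used a.lazydata.base.realized
assert buf is not None
```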