tinygrad 0.9.1__tar.gz → 0.9.2__tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (149)
  1. {tinygrad-0.9.1 → tinygrad-0.9.2}/PKG-INFO +13 -12
  2. {tinygrad-0.9.1 → tinygrad-0.9.2}/README.md +10 -11
  3. {tinygrad-0.9.1 → tinygrad-0.9.2}/setup.py +4 -2
  4. tinygrad-0.9.2/test/test_arange.py +167 -0
  5. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_const_folding.py +2 -2
  6. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_conv.py +8 -0
  7. tinygrad-0.9.2/test/test_conv_shapetracker.py +63 -0
  8. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_custom_function.py +2 -2
  9. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_device_speed.py +1 -1
  10. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype.py +40 -3
  11. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype_alu.py +46 -9
  12. tinygrad-0.9.2/test/test_hcq.py +463 -0
  13. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_image_dtype.py +22 -10
  14. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_jit.py +58 -0
  15. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_lazybuffer.py +9 -9
  16. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_lazyop.py +1 -1
  17. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer.py +860 -561
  18. tinygrad-0.9.2/test/test_linearizer_dumb.py +104 -0
  19. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer_failures.py +248 -36
  20. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer_overflows.py +2 -2
  21. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_multitensor.py +94 -28
  22. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_nn.py +51 -8
  23. tinygrad-0.9.2/test/test_ocl.py +20 -0
  24. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_ops.py +194 -8
  25. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_pattern_matcher.py +59 -41
  26. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_pickle.py +25 -3
  27. tinygrad-0.9.2/test/test_profiler.py +220 -0
  28. tinygrad-0.9.2/test/test_renderer_failures.py +43 -0
  29. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_schedule.py +490 -57
  30. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_search.py +23 -12
  31. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_speed_v_torch.py +5 -14
  32. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_subbuffer.py +1 -2
  33. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor.py +130 -5
  34. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor_data.py +12 -1
  35. tinygrad-0.9.2/test/test_transcendental.py +71 -0
  36. tinygrad-0.9.2/test/test_uop_graph.py +662 -0
  37. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_uops.py +106 -46
  38. tinygrad-0.9.2/test/test_uops_stats.py +203 -0
  39. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_verify_lazyop.py +22 -10
  40. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_winograd.py +7 -6
  41. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/codegen/kernel.py +248 -115
  42. tinygrad-0.9.2/tinygrad/codegen/lowerer.py +215 -0
  43. tinygrad-0.9.2/tinygrad/codegen/transcendental.py +310 -0
  44. tinygrad-0.9.2/tinygrad/codegen/uopgraph.py +622 -0
  45. tinygrad-0.9.2/tinygrad/codegen/uops.py +293 -0
  46. tinygrad-0.9.2/tinygrad/device.py +679 -0
  47. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/dtype.py +18 -4
  48. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/graph.py +19 -32
  49. tinygrad-0.9.2/tinygrad/engine/jit.py +276 -0
  50. tinygrad-0.9.2/tinygrad/engine/realize.py +268 -0
  51. tinygrad-0.9.2/tinygrad/engine/schedule.py +413 -0
  52. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/search.py +29 -22
  53. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/function.py +9 -0
  54. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/helpers.py +87 -49
  55. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/lazy.py +34 -35
  56. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/multi.py +41 -36
  57. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/__init__.py +39 -22
  58. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/state.py +3 -3
  59. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/ops.py +63 -62
  60. tinygrad-0.9.2/tinygrad/renderer/__init__.py +87 -0
  61. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/assembly.py +104 -106
  62. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/cstyle.py +87 -60
  63. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/llvmir.py +21 -30
  64. tinygrad-0.9.2/tinygrad/runtime/autogen/amd_gpu.py +32858 -0
  65. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/cuda.py +6 -162
  66. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/kfd.py +32 -0
  67. tinygrad-0.9.2/tinygrad/runtime/autogen/libc.py +4260 -0
  68. tinygrad-0.9.2/tinygrad/runtime/autogen/nvrtc.py +579 -0
  69. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/clang.py +2 -2
  70. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/cuda.py +8 -11
  71. tinygrad-0.9.2/tinygrad/runtime/graph/hcq.py +200 -0
  72. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/metal.py +18 -15
  73. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_amd.py +197 -305
  74. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_clang.py +2 -2
  75. tinygrad-0.9.2/tinygrad/runtime/ops_cuda.py +127 -0
  76. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_disk.py +3 -7
  77. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_gpu.py +4 -2
  78. tinygrad-0.9.2/tinygrad/runtime/ops_hip.py +70 -0
  79. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_metal.py +38 -27
  80. tinygrad-0.9.2/tinygrad/runtime/ops_nv.py +545 -0
  81. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_python.py +26 -30
  82. tinygrad-0.9.2/tinygrad/runtime/support/compiler_cuda.py +78 -0
  83. tinygrad-0.9.1/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.9.2/tinygrad/runtime/support/compiler_hip.py +15 -1
  84. tinygrad-0.9.2/tinygrad/runtime/support/elf.py +38 -0
  85. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/shapetracker.py +5 -14
  86. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/symbolic.py +4 -8
  87. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/view.py +34 -22
  88. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/tensor.py +399 -97
  89. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/PKG-INFO +13 -12
  90. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/SOURCES.txt +16 -4
  91. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/requires.txt +2 -0
  92. tinygrad-0.9.1/test/test_arange.py +0 -19
  93. tinygrad-0.9.1/test/test_conv_shapetracker.py +0 -22
  94. tinygrad-0.9.1/test/test_print_tree.py +0 -66
  95. tinygrad-0.9.1/test/test_uop_graph.py +0 -190
  96. tinygrad-0.9.1/test/test_uops_stats.py +0 -81
  97. tinygrad-0.9.1/tinygrad/codegen/linearizer.py +0 -528
  98. tinygrad-0.9.1/tinygrad/codegen/uops.py +0 -451
  99. tinygrad-0.9.1/tinygrad/device.py +0 -320
  100. tinygrad-0.9.1/tinygrad/engine/jit.py +0 -198
  101. tinygrad-0.9.1/tinygrad/engine/realize.py +0 -192
  102. tinygrad-0.9.1/tinygrad/engine/schedule.py +0 -370
  103. tinygrad-0.9.1/tinygrad/renderer/__init__.py +0 -65
  104. tinygrad-0.9.1/tinygrad/runtime/autogen/amd_gpu.py +0 -13403
  105. tinygrad-0.9.1/tinygrad/runtime/graph/hcq.py +0 -187
  106. tinygrad-0.9.1/tinygrad/runtime/ops_cuda.py +0 -185
  107. tinygrad-0.9.1/tinygrad/runtime/ops_nv.py +0 -625
  108. {tinygrad-0.9.1 → tinygrad-0.9.2}/LICENSE +0 -0
  109. {tinygrad-0.9.1 → tinygrad-0.9.2}/setup.cfg +0 -0
  110. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_assign.py +0 -0
  111. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_copy_speed.py +0 -0
  112. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_fusion_op.py +0 -0
  113. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_fuzz_shape_ops.py +0 -0
  114. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_gc.py +0 -0
  115. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_graph.py +0 -0
  116. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_kernel_cache.py +0 -0
  117. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_masked_st.py +0 -0
  118. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_method_cache.py +0 -0
  119. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_net_speed.py +0 -0
  120. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_optim.py +0 -0
  121. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_randomness.py +0 -0
  122. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_sample.py +0 -0
  123. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_setitem.py +0 -0
  124. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_specific_conv.py +0 -0
  125. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_jit.py +0 -0
  126. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_ops.py +0 -0
  127. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_shapetracker.py +0 -0
  128. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor_variable.py +0 -0
  129. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_to_numpy.py +0 -0
  130. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_zero_copy.py +0 -0
  131. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/__init__.py +0 -0
  132. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/codegen/__init__.py +0 -0
  133. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/__init__.py +0 -0
  134. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/datasets.py +0 -0
  135. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/optim.py +0 -0
  136. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/__init__.py +0 -0
  137. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/comgr.py +0 -0
  138. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hip.py +0 -0
  139. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hsa.py +0 -0
  140. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/io_uring.py +0 -0
  141. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/nv_gpu.py +0 -0
  142. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/opencl.py +0 -0
  143. {tinygrad-0.9.1/tinygrad/runtime/driver → tinygrad-0.9.2/tinygrad/runtime/graph}/__init__.py +0 -0
  144. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_llvm.py +0 -0
  145. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_npy.py +0 -0
  146. {tinygrad-0.9.1/tinygrad/runtime/graph → tinygrad-0.9.2/tinygrad/runtime/support}/__init__.py +0 -0
  147. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/__init__.py +0 -0
  148. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/dependency_links.txt +0 -0
  149. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/top_level.txt +0 -0
{tinygrad-0.9.1 → tinygrad-0.9.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tinygrad
-Version: 0.9.1
+Version: 0.9.2
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
@@ -39,12 +39,14 @@ Requires-Dist: safetensors; extra == "testing"
 Requires-Dist: transformers; extra == "testing"
 Requires-Dist: sentencepiece; extra == "testing"
 Requires-Dist: tiktoken; extra == "testing"
+Requires-Dist: blobfile; extra == "testing"
 Requires-Dist: librosa; extra == "testing"
 Requires-Dist: networkx; extra == "testing"
 Requires-Dist: hypothesis; extra == "testing"
 Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
 Provides-Extra: docs
+Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
 Requires-Dist: mkdocstrings[python]; extra == "docs"
 Requires-Dist: markdown-callouts; extra == "docs"
@@ -107,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.
 
-```py
+```python
 from tinygrad import Tensor, nn
 
 class LinearNet:
@@ -122,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)
 
 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader
 
-for i in range(10):
-  optim.zero_grad()
-  loss = model(x).sparse_categorical_crossentropy(y).backward()
-  optim.step()
-  print(i, loss.item())
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```
 
 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -169,7 +172,7 @@ Documentation along with a quick start guide can be found on the [docs website](
 
 ### Quick example comparing to PyTorch
 
-```py
+```python
 from tinygrad import Tensor
 
 x = Tensor.eye(3, requires_grad=True)
@@ -182,7 +185,7 @@ print(y.grad.numpy()) # dz/dy
 ```
 
 The same thing but in PyTorch:
-```py
+```python
 import torch
 
 x = torch.eye(3, requires_grad=True)
@@ -230,6 +233,4 @@ python3 -m pytest test/ # whole test suite
 
 #### Process replay tests
 
-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) detects changes in the generated kernels of CI tests by comparing them against tinygrad master. If your PR is a refactor or speedup without any expected behavior change, it should include a green process replay pass to get merged.
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
{tinygrad-0.9.1 → tinygrad-0.9.2}/README.md
@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.
 
-```py
+```python
 from tinygrad import Tensor, nn
 
 class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)
 
 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader
 
-for i in range(10):
-  optim.zero_grad()
-  loss = model(x).sparse_categorical_crossentropy(y).backward()
-  optim.step()
-  print(i, loss.item())
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```
 
 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -113,7 +114,7 @@ Documentation along with a quick start guide can be found on the [docs website](
 
 ### Quick example comparing to PyTorch
 
-```py
+```python
 from tinygrad import Tensor
 
 x = Tensor.eye(3, requires_grad=True)
@@ -126,7 +127,7 @@ print(y.grad.numpy()) # dz/dy
 ```
 
 The same thing but in PyTorch:
-```py
+```python
 import torch
 
 x = torch.eye(3, requires_grad=True)
@@ -174,6 +175,4 @@ python3 -m pytest test/ # whole test suite
 
 #### Process replay tests
 
-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) detects changes in the generated kernels of CI tests by comparing them against tinygrad master. If your PR is a refactor or speedup without any expected behavior change, it should include a green process replay pass to get merged.
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
{tinygrad-0.9.1 → tinygrad-0.9.2}/setup.py
@@ -8,14 +8,14 @@ with open(directory / 'README.md', encoding='utf-8') as f:
   long_description = f.read()
 
 setup(name='tinygrad',
-      version='0.9.1',
+      version='0.9.2',
       description='You like pytorch? You like micrograd? You love tinygrad! <3',
       author='George Hotz',
       license='MIT',
       long_description=long_description,
       long_description_content_type='text/markdown',
       packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                  'tinygrad.runtime', 'tinygrad.runtime.driver', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+                  'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
       classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License"
@@ -51,6 +51,7 @@ setup(name='tinygrad',
         "transformers",
         "sentencepiece",
         "tiktoken",
+        "blobfile",
         "librosa",
         "networkx",
         "hypothesis",
@@ -58,6 +59,7 @@ setup(name='tinygrad',
         "bottle",
       ],
       'docs': [
+        "mkdocs",
         "mkdocs-material",
         "mkdocstrings[python]",
         "markdown-callouts",
tinygrad-0.9.2/test/test_arange.py
@@ -0,0 +1,167 @@
+import unittest, contextlib
+import numpy as np
+from tinygrad import Tensor, GlobalCounters, dtypes, nn
+from tinygrad.helpers import CI, Context, getenv
+from tinygrad.engine.realize import run_schedule
+from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+from tinygrad.engine.realize import CompiledRunner, ExecItem
+from tinygrad.engine.search import get_kernel_actions
+
+class TestArange(unittest.TestCase):
+  def _get_flops(self, N, opts=None):
+    GlobalCounters.reset()
+    tt = Tensor.arange(N)
+    sched = tt.schedule()
+    self.assertEqual(len(sched), 1)
+    k = Kernel(sched[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    p = k.to_program()
+    print(p.name)
+    #print(p.src)
+    ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+    np.testing.assert_equal(tt.numpy(), np.arange(N))
+    return p.op_estimate
+
+  def test_complexity(self, opts=None):
+    # add 1 to avoid divide by 0. arange is 0 flops now!
+    f1 = self._get_flops(256, opts) + 1
+    f2 = self._get_flops(2560, opts) + 1
+    print(f"{f1=}, {f2=}")
+    assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+
+  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)])
+  def test_complexity_w_unroll(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)])
+  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)])
+
+  @unittest.skip("doesn't work yet")
+  def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+  def test_all_opts(self, opts=None, exclude=None):
+    k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+    for opts in all_opts:
+      if exclude is not None and opts[-1] in exclude: continue
+      print(opts)
+      self.test_complexity(opts)
+  def test_all_opts_w_local(self):
+    with contextlib.suppress(KernelOptError):
+      return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+  def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+  def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+  def test_all_opts_w_upcast_and_unroll(self):
+    return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+class TestIndexing(unittest.TestCase):
+  def test_arange_2_reduce(self):
+    needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+    needle[1337] = 1
+    needle.realize()
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      # TODO: it should work without these reshapes
+      out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+      sched = out.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+    assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_manual_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+      idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+      reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+      X = full.sum(axis=(2,3))
+      sched = X.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      # TODO: enable these asserts when the scheduler can handle this
+      #assert len(sched) == 1, f"{len(sched)} != 1"
+      run_schedule(sched)
+      #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index_fused(self, noopt=1):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      assert len(sched) == 2
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(real_index, X.numpy())
+  @unittest.skip("not ready")
+  def test_index_fused_opt(self): self.test_index_fused(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_index_mnist(self, noopt=1):
+    from tinygrad.nn.datasets import mnist
+    X_train, Y_train, _, _ = mnist()
+    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+      GlobalCounters.reset()
+      samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0])
+      x = X_train[samples].numpy()
+      y = Y_train[samples].numpy()
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+    np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+  @unittest.skip("not ready")
+  def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_llama_embedding(self, noopt=1, op_limit=0):
+    # llama3 is 128256
+    vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+    emb = nn.Embedding(vocab_size, embed_size)
+    emb_w = emb.weight.numpy()
+    x = Tensor([1,2,3,4])
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      z = emb(x).realize()
+      self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+      self.assertEqual(GlobalCounters.kernel_count, 2)
+    if getenv("CHECK", 1):
+      import torch
+      with torch.no_grad():
+        torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+        torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+      torch_z = torch_emb(torch.tensor(x.numpy()))
+      # TODO: reshape to match torch, should we do this in nn?
+      np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+  # at least the arange is being fused
+  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1736704000)
+
+if __name__ == "__main__":
+  unittest.main()
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_const_folding.py
@@ -2,14 +2,14 @@ import unittest, math
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.helpers import CI
-from tinygrad.ops import BufferOps
+from tinygrad.ops import MetaOps
 import numpy as np
 from test.helpers import is_dtype_supported
 
 def _check_ast_count(desired_count:int, t:Tensor):
   # NOTE: this has side effect because everything can be scheduled only once
   schedule = create_schedule(t.lazydata.lbs)
-  asts = [s for s in schedule if s.ast[0].op is BufferOps.STORE]
+  asts = [s for s in schedule if s.ast.op is MetaOps.KERNEL]
   assert len(asts) == desired_count
 
 class TestUnaryOpsConstFolding(unittest.TestCase):
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_conv.py
@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):
 
     print(ret.numpy())
 
+  def test_two_binops_no_rerun_small(self):
+    Tensor.no_grad = True
+    x = Tensor.rand(1,1,32,32)
+    w = Tensor.rand(1,1,3,3)
+    out = x.conv2d(w, padding=(1,1))
+    np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+    Tensor.no_grad = False
+
   def test_two_binops_no_rerun(self):
     Tensor.no_grad = True
     x = Tensor.randn(1,12,128,256)
tinygrad-0.9.2/test/test_conv_shapetracker.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+import unittest
+from tinygrad.tensor import Tensor
+from tinygrad.ops import MetaOps, BufferOps
+from tinygrad.nn import Conv2d
+from tinygrad.engine.schedule import create_schedule
+from tinygrad.shape.shapetracker import ShapeTracker, View
+from tinygrad.helpers import prod
+from test.unit.test_shapetracker import shapetracker_getitem
+
+class TestConvShapetracker(unittest.TestCase):
+  def test_conv_3x3_one_view(self):
+    conv = Conv2d(16, 32, (3, 3))
+    seen = set()
+
+    # first run to init the weights, they are saved in seen
+    create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen)
+    # run it again to get the kernels
+    sched = [si for si in create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen) if si.ast.op is MetaOps.KERNEL]
+    assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
+    for st in [x.arg.st for x in sched[0].ast.lazyops if x.op is BufferOps.LOAD]:
+      assert len(st.views) == 1
+
+  @unittest.expectedFailure
+  def test_conv_2x2_backward_one_view(self):
+    X = Tensor.rand(1, 1, 3, 3, requires_grad=True)
+    conv = Conv2d(1, 1, (2, 2), bias=False)
+    conv(X).mean().backward()
+    si = X.grad.schedule()[-1]
+    print(si)
+    ldb = [x for x in si.ast.lazyops if x.op is BufferOps.LOAD][0]
+    st: ShapeTracker = ldb.arg.st.simplify()
+    # NOTE: st.real_size() is broken
+    print(si.inputs[0].size)
+    #self.assertEqual(si.inputs[0].size, st.real_size())
+    for v in st.views: print(v)
+
+    # same st
+    test_st = ShapeTracker((
+      View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+      View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+           mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False)))
+    #test_st = ShapeTracker((
+    #  View(shape=(2,4), strides=(1,4), offset=0, mask=None, contiguous=False),
+    #)).simplify()
+    #View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+    #View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+    #  mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False))).simplify()
+    print("*** new ***")
+    for v in test_st.views: print(v)
+    for i in range(prod(st.shape)):
+      i1, i2 = shapetracker_getitem(st, i), shapetracker_getitem(test_st, i)
+      print(i, i1, i2, si.inputs[0].size, i1==i2)
+      #self.assertEqual(i1, i2)
+
+    for stt in [st, test_st]:
+      s,va = stt.expr_idxs()
+      print(s)
+      print(va)
+    assert len(st.views) <= 2
+
+if __name__ == '__main__':
+  unittest.main()
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_custom_function.py
@@ -31,7 +31,7 @@ def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(
 # NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
 # In general, it is also optional to write a backward function, just your backward pass won't work without it
 
-from tinygrad.ops import LoadOps, BinaryOps, UnaryOps
+from tinygrad.ops import MetaOps, BinaryOps, UnaryOps
 from tinygrad.lazy import LazyBuffer
 from tinygrad.tensor import Function
 
@@ -39,7 +39,7 @@ class ATan2(Function):
   def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
     assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
     self.a, self.b = a, b
-    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), LoadOps.CUSTOM,
+    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), MetaOps.CUSTOM,
                              arg={"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device], srcs=(a.contiguous(), b.contiguous()))
   def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
     recip = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b)).e(UnaryOps.RECIP)
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_device_speed.py
@@ -1,6 +1,6 @@
 import unittest
 from tinygrad import Device
-from tinygrad.codegen.uops import UOpGraph
+from tinygrad.codegen.uopgraph import UOpGraph
 from tinygrad.helpers import Timing, Profiling
 
 class TestDeviceSpeed(unittest.TestCase):
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype.py
@@ -202,7 +202,7 @@ class TestFloatDType(TestDType):
 
 class TestDoubleDType(TestDType):
   DTYPE = dtypes.double
-  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX"), "conversion not supported on CUDACPU and PTX") # TODO: why not?
+  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX"), "conversion not supported on CI CUDA and PTX") # TODO: why not?
   def test_float64_increased_precision(self):
     for func in [
       lambda t: t.exp(),
@@ -267,7 +267,10 @@ class TestInt32DType(TestDType): DTYPE = dtypes.int32
 class TestUint32DType(TestDType): DTYPE = dtypes.uint32
 
 class TestInt64DType(TestDType): DTYPE = dtypes.int64
-class TestUint64DType(TestDType): DTYPE = dtypes.uint64
+class TestUint64DType(TestDType):
+  DTYPE = dtypes.uint64
+  def test_uint64_load(self):
+    assert Tensor(2**64 - 1, dtype=dtypes.uint64).numpy() == 2**64 - 1
 
 class TestBoolDType(TestDType): DTYPE = dtypes.bool
 
@@ -298,7 +301,7 @@ class TestEqStrDType(unittest.TestCase):
   def test_strs(self):
     if PtrDType is None: raise unittest.SkipTest("no PtrDType support")
     self.assertEqual(str(dtypes.imagef((1,2,4))), "dtypes.imagef((1, 2, 4))")
-    self.assertEqual(str(PtrDType(dtypes.float32)), "ptr.dtypes.float")
+    self.assertEqual(str(PtrDType(dtypes.float32)), "PtrDType(dtypes.float)")
 
 class TestHelpers(unittest.TestCase):
   signed_ints = (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64)
@@ -347,6 +350,20 @@ class TestHelpers(unittest.TestCase):
     with self.assertRaises(RuntimeError): dtypes.from_py({})
     with self.assertRaises(RuntimeError): dtypes.from_py(set())
 
+  def test_dtype_range(self):
+    for dt in core_dtypes:
+      if dtypes.is_float(dt):
+        np.testing.assert_equal(dtypes.min(dt), -math.inf)
+        np.testing.assert_equal(dtypes.max(dt), math.inf)
+      elif dtypes.is_int(dt):
+        info = np.iinfo(_to_np_dtype(dt))
+        np.testing.assert_equal(dtypes.min(dt), info.min)
+        np.testing.assert_equal(dtypes.max(dt), info.max)
+      else:
+        assert dt == dtypes.bool, dt
+        np.testing.assert_equal(dtypes.min(dt), False)
+        np.testing.assert_equal(dtypes.max(dt), True)
+
 class TestTypeSpec(unittest.TestCase):
   def setUp(self):
     self.old_default_int, self.old_default_float = dtypes.default_int, dtypes.default_float
@@ -378,6 +395,23 @@ class TestTypeSpec(unittest.TestCase):
     subprocess.run(['DEFAULT_FLOAT=TYPO python3 -c "from tinygrad import dtypes"'],
                    shell=True, check=True)
 
+  def test_dtype_str_arg(self):
+    n = np.random.normal(0, 1, (10, 10)).astype(np.float32)
+    tested = 0
+    for dtype_str, dtype in [
+      ("bool", dtypes.bool), ("int8", dtypes.int8), ("int", dtypes.int), ("uint32", dtypes.uint32), ("float32", dtypes.float32)]:
+      np.testing.assert_equal(Tensor(n, dtype=dtype_str).numpy(), Tensor(n, dtype=dtype).numpy())
+      np.testing.assert_equal(Tensor(n).cast(dtype_str).numpy(), Tensor(n).cast(dtype).numpy())
+      if dtype.itemsize == 4:
+        np.testing.assert_equal(Tensor(n).bitcast(dtype_str).numpy(), Tensor(n).bitcast(dtype).numpy())
+        tested += 1
+    assert tested == 3
+
+    with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="nonexistdtype")
+    with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="")
+
+    np.testing.assert_equal(Tensor(n).sum(acc_dtype="int16").numpy(), Tensor(n).sum(acc_dtype=dtypes.int16).numpy())
+
   @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats))
   def test_creation(self, default_int, default_float):
     dtypes.default_int, dtypes.default_float = default_int, default_float
@@ -439,6 +473,9 @@ class TestTypeSpec(unittest.TestCase):
     _assert_eq(Tensor.arange(5, dtype=dtypes.float16), dtypes.float16, np.arange(5))
     _assert_eq(Tensor.arange(3, 9, 0.7), dtypes.default_float, np.arange(3, 9, 0.7))
     _assert_eq(Tensor.arange(3, 8.5, 3), dtypes.default_float, np.arange(3, 8.5, 3))
+    # stop-start and step have different signs
+    _assert_eq(Tensor.arange(3, 5, -2), dtypes.default_int, np.arange(3, 5, -2))
+    _assert_eq(Tensor.arange(5.0, 3.0), dtypes.default_float, np.arange(5.0, 3.0))
 
   @given(strat.sampled_from(core_dtypes), strat.sampled_from([operator.gt, operator.ge, operator.le, operator.lt, operator.eq, operator.ne]))
   def test_bool_ops(self, dtype, op):
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype_alu.py
@@ -16,7 +16,7 @@ settings.register_profile("my_profile", max_examples=200, deadline=None, derando
 settings.load_profile("my_profile")
 print(settings.default)
 
-dtypes_float = (dtypes.float32, dtypes.float16)
+dtypes_float = (dtypes.float16, dtypes.float32, dtypes.float64)
 dtypes_int = (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
 dtypes_bool = (dtypes.bool,)
 binary_operations = [operator.add, operator.sub, operator.mul, operator.lt, operator.eq]
@@ -24,9 +24,9 @@ binary_operations = [operator.add, operator.sub, operator.mul, operator.lt, oper
 # TODO: LLVM comparing with nan is incorrect
 if Device.DEFAULT == "LLVM":
   binary_operations.remove(operator.lt)
-  binary_operations.remove(operator.eq)
 
-integer_binary_operations = binary_operations + [(Tensor.xor, np.bitwise_xor)]
+integer_binary_operations = binary_operations + [(Tensor.xor, np.bitwise_xor), (Tensor.bitwise_and, np.bitwise_and),
+                                                 (Tensor.bitwise_or, np.bitwise_or)]
 unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), operator.neg, (Tensor.sin, np.sin),
                     (Tensor.sqrt, np.sqrt), (Tensor.reciprocal, np.reciprocal)]
 
@@ -39,9 +39,8 @@ unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), operator.neg, (T
 # TODO: (a+b)/2 in tensor.py's maximum can overflow. This requires a new implementation of maximum that can be backpropagated
 #binary_operations += [(Tensor.maximum, np.maximum)]
 
-# TODO: CUDACPU segfaults on sin
-# TODO: METAL sin is flaky for float16
-if getenv("CUDACPU") or (getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "METAL": unary_operations.remove((Tensor.sin, np.sin))
+# TODO: CI CUDA segfaults on sin
+if getenv("MOCKGPU") and Device.DEFAULT == "NV": unary_operations.remove((Tensor.sin, np.sin))
 
 class ht:
   float64 = strat.floats(width=64, allow_subnormal=False)
@@ -68,7 +67,7 @@ def universal_test_unary(a, dtype, op):
   if not isinstance(op, tuple): op = (op, op)
   out: Tensor = op[0](Tensor([a], dtype=dtype))
   sched = create_schedule([out.lazydata])
-  ast = sched[-1].ast[0]
+  ast = sched[-1].ast
   run_schedule(sched)
   tensor_value = out.numpy()
   numpy_value = op[1](np.array([a]).astype(_to_np_dtype(dtype)))
@@ -145,8 +144,8 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.int32, ht.int32, ht.float32, strat.sampled_from(integer_binary_operations), strat.sampled_from(binary_operations))
   def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)
 
-  # Metal and CUDACPU and HIP behave differently than numpy in CI for overflows
-  skip_overflow = CI and (Device.DEFAULT in {"AMD", "NV"} or getenv("CUDACPU"))
+  # Metal and CUDA and HIP behave differently than numpy in CI for overflows
+  skip_overflow = CI and Device.DEFAULT in {"AMD", "NV"}
   @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
@@ -161,5 +160,43 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.int32, strat.sampled_from(dtypes_float+dtypes_int+dtypes_bool))
   def test_int32_cast(self, a, dtype): universal_test_cast(a, dtypes.int32, dtype)
 
+class TestFromFuzzer(unittest.TestCase):
+  @given(strat.sampled_from(dtypes_float))
+  def test_sin(self, dtype):
+    if not is_dtype_supported(dtype): return
+    if dtype == dtypes.float64:
+      # crashes in CI CUDA
+      if getenv("MOCKGPU") and Device.DEFAULT == "NV": return
+    def _test_value(n: float, unit: float=1.0):
+      next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
+      ulp = next_float - 1.0
+      ulp = unit * ulp
+      np.testing.assert_allclose(Tensor([n], dtype=dtype).sin().numpy(), np.sin(np.array([n], dtype=_to_np_dtype(dtype))), atol=ulp, rtol=1e-5)
+    _test_value(-35.0)
+    _test_value(-25.0)
+    _test_value(25.0)
+    _test_value(30.0) # 30.0 == switch_over
+    _test_value(35.0)
+    _test_value(0.0)
+    _test_value(np.pi / 2)
+    # worst case of ulp 1.5
+    _test_value(np.pi * 2, unit=1.5)
+  @given(strat.sampled_from(dtypes_float))
+  def test_log2(self, dtype):
+    if not is_dtype_supported(dtype): return
+    if dtype == dtypes.float64:
+      # crashes in CI CUDA
+      if getenv("MOCKGPU") and Device.DEFAULT == "NV": return
+    def _test_value(n: float, unit: float=1.0):
+      next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
+      ulp = next_float - 1.0
+      ulp = unit * ulp
+      np.testing.assert_allclose(Tensor([n], dtype=dtype).log2().numpy(), np.log2(np.array([n], dtype=_to_np_dtype(dtype))), atol=ulp, rtol=1e-5)
+    fmin = np.finfo(_to_np_dtype(dtype)).tiny
+    for scale in [1.0, 1e10, 1e20, 1e30]:
+      _test_value(fmin * scale)
+      _test_value(-fmin * scale)
+    _test_value(0)
+
 if __name__ == '__main__':
   unittest.main()