tinygrad 0.9.1__tar.gz → 0.9.2__tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (149)
  1. {tinygrad-0.9.1 → tinygrad-0.9.2}/PKG-INFO +13 -12
  2. {tinygrad-0.9.1 → tinygrad-0.9.2}/README.md +10 -11
  3. {tinygrad-0.9.1 → tinygrad-0.9.2}/setup.py +4 -2
  4. tinygrad-0.9.2/test/test_arange.py +167 -0
  5. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_const_folding.py +2 -2
  6. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_conv.py +8 -0
  7. tinygrad-0.9.2/test/test_conv_shapetracker.py +63 -0
  8. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_custom_function.py +2 -2
  9. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_device_speed.py +1 -1
  10. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype.py +40 -3
  11. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype_alu.py +46 -9
  12. tinygrad-0.9.2/test/test_hcq.py +463 -0
  13. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_image_dtype.py +22 -10
  14. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_jit.py +58 -0
  15. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_lazybuffer.py +9 -9
  16. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_lazyop.py +1 -1
  17. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer.py +860 -561
  18. tinygrad-0.9.2/test/test_linearizer_dumb.py +104 -0
  19. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer_failures.py +248 -36
  20. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer_overflows.py +2 -2
  21. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_multitensor.py +94 -28
  22. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_nn.py +51 -8
  23. tinygrad-0.9.2/test/test_ocl.py +20 -0
  24. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_ops.py +194 -8
  25. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_pattern_matcher.py +59 -41
  26. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_pickle.py +25 -3
  27. tinygrad-0.9.2/test/test_profiler.py +220 -0
  28. tinygrad-0.9.2/test/test_renderer_failures.py +43 -0
  29. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_schedule.py +490 -57
  30. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_search.py +23 -12
  31. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_speed_v_torch.py +5 -14
  32. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_subbuffer.py +1 -2
  33. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor.py +130 -5
  34. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor_data.py +12 -1
  35. tinygrad-0.9.2/test/test_transcendental.py +71 -0
  36. tinygrad-0.9.2/test/test_uop_graph.py +662 -0
  37. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_uops.py +106 -46
  38. tinygrad-0.9.2/test/test_uops_stats.py +203 -0
  39. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_verify_lazyop.py +22 -10
  40. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_winograd.py +7 -6
  41. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/codegen/kernel.py +248 -115
  42. tinygrad-0.9.2/tinygrad/codegen/lowerer.py +215 -0
  43. tinygrad-0.9.2/tinygrad/codegen/transcendental.py +310 -0
  44. tinygrad-0.9.2/tinygrad/codegen/uopgraph.py +622 -0
  45. tinygrad-0.9.2/tinygrad/codegen/uops.py +293 -0
  46. tinygrad-0.9.2/tinygrad/device.py +679 -0
  47. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/dtype.py +18 -4
  48. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/graph.py +19 -32
  49. tinygrad-0.9.2/tinygrad/engine/jit.py +276 -0
  50. tinygrad-0.9.2/tinygrad/engine/realize.py +268 -0
  51. tinygrad-0.9.2/tinygrad/engine/schedule.py +413 -0
  52. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/search.py +29 -22
  53. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/function.py +9 -0
  54. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/helpers.py +87 -49
  55. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/lazy.py +34 -35
  56. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/multi.py +41 -36
  57. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/__init__.py +39 -22
  58. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/state.py +3 -3
  59. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/ops.py +63 -62
  60. tinygrad-0.9.2/tinygrad/renderer/__init__.py +87 -0
  61. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/assembly.py +104 -106
  62. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/cstyle.py +87 -60
  63. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/llvmir.py +21 -30
  64. tinygrad-0.9.2/tinygrad/runtime/autogen/amd_gpu.py +32858 -0
  65. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/cuda.py +6 -162
  66. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/kfd.py +32 -0
  67. tinygrad-0.9.2/tinygrad/runtime/autogen/libc.py +4260 -0
  68. tinygrad-0.9.2/tinygrad/runtime/autogen/nvrtc.py +579 -0
  69. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/clang.py +2 -2
  70. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/cuda.py +8 -11
  71. tinygrad-0.9.2/tinygrad/runtime/graph/hcq.py +200 -0
  72. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/metal.py +18 -15
  73. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_amd.py +197 -305
  74. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_clang.py +2 -2
  75. tinygrad-0.9.2/tinygrad/runtime/ops_cuda.py +127 -0
  76. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_disk.py +3 -7
  77. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_gpu.py +4 -2
  78. tinygrad-0.9.2/tinygrad/runtime/ops_hip.py +70 -0
  79. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_metal.py +38 -27
  80. tinygrad-0.9.2/tinygrad/runtime/ops_nv.py +545 -0
  81. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_python.py +26 -30
  82. tinygrad-0.9.2/tinygrad/runtime/support/compiler_cuda.py +78 -0
  83. tinygrad-0.9.1/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.9.2/tinygrad/runtime/support/compiler_hip.py +15 -1
  84. tinygrad-0.9.2/tinygrad/runtime/support/elf.py +38 -0
  85. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/shapetracker.py +5 -14
  86. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/symbolic.py +4 -8
  87. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/view.py +34 -22
  88. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/tensor.py +399 -97
  89. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/PKG-INFO +13 -12
  90. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/SOURCES.txt +16 -4
  91. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/requires.txt +2 -0
  92. tinygrad-0.9.1/test/test_arange.py +0 -19
  93. tinygrad-0.9.1/test/test_conv_shapetracker.py +0 -22
  94. tinygrad-0.9.1/test/test_print_tree.py +0 -66
  95. tinygrad-0.9.1/test/test_uop_graph.py +0 -190
  96. tinygrad-0.9.1/test/test_uops_stats.py +0 -81
  97. tinygrad-0.9.1/tinygrad/codegen/linearizer.py +0 -528
  98. tinygrad-0.9.1/tinygrad/codegen/uops.py +0 -451
  99. tinygrad-0.9.1/tinygrad/device.py +0 -320
  100. tinygrad-0.9.1/tinygrad/engine/jit.py +0 -198
  101. tinygrad-0.9.1/tinygrad/engine/realize.py +0 -192
  102. tinygrad-0.9.1/tinygrad/engine/schedule.py +0 -370
  103. tinygrad-0.9.1/tinygrad/renderer/__init__.py +0 -65
  104. tinygrad-0.9.1/tinygrad/runtime/autogen/amd_gpu.py +0 -13403
  105. tinygrad-0.9.1/tinygrad/runtime/graph/hcq.py +0 -187
  106. tinygrad-0.9.1/tinygrad/runtime/ops_cuda.py +0 -185
  107. tinygrad-0.9.1/tinygrad/runtime/ops_nv.py +0 -625
  108. {tinygrad-0.9.1 → tinygrad-0.9.2}/LICENSE +0 -0
  109. {tinygrad-0.9.1 → tinygrad-0.9.2}/setup.cfg +0 -0
  110. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_assign.py +0 -0
  111. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_copy_speed.py +0 -0
  112. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_fusion_op.py +0 -0
  113. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_fuzz_shape_ops.py +0 -0
  114. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_gc.py +0 -0
  115. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_graph.py +0 -0
  116. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_kernel_cache.py +0 -0
  117. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_masked_st.py +0 -0
  118. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_method_cache.py +0 -0
  119. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_net_speed.py +0 -0
  120. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_optim.py +0 -0
  121. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_randomness.py +0 -0
  122. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_sample.py +0 -0
  123. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_setitem.py +0 -0
  124. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_specific_conv.py +0 -0
  125. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_jit.py +0 -0
  126. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_ops.py +0 -0
  127. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_shapetracker.py +0 -0
  128. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor_variable.py +0 -0
  129. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_to_numpy.py +0 -0
  130. {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_zero_copy.py +0 -0
  131. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/__init__.py +0 -0
  132. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/codegen/__init__.py +0 -0
  133. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/__init__.py +0 -0
  134. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/datasets.py +0 -0
  135. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/optim.py +0 -0
  136. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/__init__.py +0 -0
  137. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/comgr.py +0 -0
  138. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hip.py +0 -0
  139. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hsa.py +0 -0
  140. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/io_uring.py +0 -0
  141. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/nv_gpu.py +0 -0
  142. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/opencl.py +0 -0
  143. {tinygrad-0.9.1/tinygrad/runtime/driver → tinygrad-0.9.2/tinygrad/runtime/graph}/__init__.py +0 -0
  144. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_llvm.py +0 -0
  145. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_npy.py +0 -0
  146. {tinygrad-0.9.1/tinygrad/runtime/graph → tinygrad-0.9.2/tinygrad/runtime/support}/__init__.py +0 -0
  147. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/__init__.py +0 -0
  148. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/dependency_links.txt +0 -0
  149. {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/top_level.txt +0 -0
{tinygrad-0.9.1 → tinygrad-0.9.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tinygrad
-Version: 0.9.1
+Version: 0.9.2
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
@@ -39,12 +39,14 @@ Requires-Dist: safetensors; extra == "testing"
 Requires-Dist: transformers; extra == "testing"
 Requires-Dist: sentencepiece; extra == "testing"
 Requires-Dist: tiktoken; extra == "testing"
+Requires-Dist: blobfile; extra == "testing"
 Requires-Dist: librosa; extra == "testing"
 Requires-Dist: networkx; extra == "testing"
 Requires-Dist: hypothesis; extra == "testing"
 Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
 Provides-Extra: docs
+Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
 Requires-Dist: mkdocstrings[python]; extra == "docs"
 Requires-Dist: markdown-callouts; extra == "docs"
@@ -107,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.
 
-```py
+```python
 from tinygrad import Tensor, nn
 
 class LinearNet:
@@ -122,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)
 
 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader
 
-for i in range(10):
-  optim.zero_grad()
-  loss = model(x).sparse_categorical_crossentropy(y).backward()
-  optim.step()
-  print(i, loss.item())
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```
 
 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -169,7 +172,7 @@ Documentation along with a quick start guide can be found on the [docs website](
 
 ### Quick example comparing to PyTorch
 
-```py
+```python
 from tinygrad import Tensor
 
 x = Tensor.eye(3, requires_grad=True)
@@ -182,7 +185,7 @@ print(y.grad.numpy()) # dz/dy
 ```
 
 The same thing but in PyTorch:
-```py
+```python
 import torch
 
 x = torch.eye(3, requires_grad=True)
@@ -230,6 +233,4 @@ python3 -m pytest test/ # whole test suite
 
 #### Process replay tests
 
-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) detects changes in the generated kernels of CI tests by comparing them against tinygrad master. If your PR is a refactor or speedup without any expected behavior change, it should include a green process replay pass to get merged.
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
{tinygrad-0.9.1 → tinygrad-0.9.2}/README.md
@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.
 
-```py
+```python
 from tinygrad import Tensor, nn
 
 class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)
 
 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader
 
-for i in range(10):
-  optim.zero_grad()
-  loss = model(x).sparse_categorical_crossentropy(y).backward()
-  optim.step()
-  print(i, loss.item())
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```
 
 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -113,7 +114,7 @@ Documentation along with a quick start guide can be found on the [docs website](
 
 ### Quick example comparing to PyTorch
 
-```py
+```python
 from tinygrad import Tensor
 
 x = Tensor.eye(3, requires_grad=True)
@@ -126,7 +127,7 @@ print(y.grad.numpy()) # dz/dy
 ```
 
 The same thing but in PyTorch:
-```py
+```python
 import torch
 
 x = torch.eye(3, requires_grad=True)
@@ -174,6 +175,4 @@ python3 -m pytest test/ # whole test suite
 
 #### Process replay tests
 
-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) detects changes in the generated kernels of CI tests by comparing them against tinygrad master. If your PR is a refactor or speedup without any expected behavior change, it should include a green process replay pass to get merged.
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
{tinygrad-0.9.1 → tinygrad-0.9.2}/setup.py
@@ -8,14 +8,14 @@ with open(directory / 'README.md', encoding='utf-8') as f:
   long_description = f.read()
 
 setup(name='tinygrad',
-      version='0.9.1',
+      version='0.9.2',
       description='You like pytorch? You like micrograd? You love tinygrad! <3',
       author='George Hotz',
       license='MIT',
       long_description=long_description,
       long_description_content_type='text/markdown',
       packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                  'tinygrad.runtime', 'tinygrad.runtime.driver', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+                  'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
       classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License"
@@ -51,6 +51,7 @@ setup(name='tinygrad',
         "transformers",
         "sentencepiece",
         "tiktoken",
+        "blobfile",
         "librosa",
         "networkx",
         "hypothesis",
@@ -58,6 +59,7 @@ setup(name='tinygrad',
         "bottle",
       ],
       'docs': [
+        "mkdocs",
         "mkdocs-material",
         "mkdocstrings[python]",
         "markdown-callouts",
tinygrad-0.9.2/test/test_arange.py
@@ -0,0 +1,167 @@
+import unittest, contextlib
+import numpy as np
+from tinygrad import Tensor, GlobalCounters, dtypes, nn
+from tinygrad.helpers import CI, Context, getenv
+from tinygrad.engine.realize import run_schedule
+from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+from tinygrad.engine.realize import CompiledRunner, ExecItem
+from tinygrad.engine.search import get_kernel_actions
+
+class TestArange(unittest.TestCase):
+  def _get_flops(self, N, opts=None):
+    GlobalCounters.reset()
+    tt = Tensor.arange(N)
+    sched = tt.schedule()
+    self.assertEqual(len(sched), 1)
+    k = Kernel(sched[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    p = k.to_program()
+    print(p.name)
+    #print(p.src)
+    ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+    np.testing.assert_equal(tt.numpy(), np.arange(N))
+    return p.op_estimate
+
+  def test_complexity(self, opts=None):
+    # add 1 to avoid divide by 0. arange is 0 flops now!
+    f1 = self._get_flops(256, opts) + 1
+    f2 = self._get_flops(2560, opts) + 1
+    print(f"{f1=}, {f2=}")
+    assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+
+  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)])
+  def test_complexity_w_unroll(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)])
+  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)])
+
+  @unittest.skip("doesn't work yet")
+  def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+  def test_all_opts(self, opts=None, exclude=None):
+    k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+    for opts in all_opts:
+      if exclude is not None and opts[-1] in exclude: continue
+      print(opts)
+      self.test_complexity(opts)
+  def test_all_opts_w_local(self):
+    with contextlib.suppress(KernelOptError):
+      return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+  def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+  def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+  def test_all_opts_w_upcast_and_unroll(self):
+    return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+class TestIndexing(unittest.TestCase):
+  def test_arange_2_reduce(self):
+    needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+    needle[1337] = 1
+    needle.realize()
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      # TODO: it should work without these reshapes
+      out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+      sched = out.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+    assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_manual_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+      idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+      reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+      X = full.sum(axis=(2,3))
+      sched = X.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      # TODO: enable these asserts when the scheduler can handle this
+      #assert len(sched) == 1, f"{len(sched)} != 1"
+      run_schedule(sched)
+      #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index_fused(self, noopt=1):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      assert len(sched) == 2
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(real_index, X.numpy())
+  @unittest.skip("not ready")
+  def test_index_fused_opt(self): self.test_index_fused(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_index_mnist(self, noopt=1):
+    from tinygrad.nn.datasets import mnist
+    X_train, Y_train, _, _ = mnist()
+    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+      GlobalCounters.reset()
+      samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0])
+      x = X_train[samples].numpy()
+      y = Y_train[samples].numpy()
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+    np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+  @unittest.skip("not ready")
+  def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_llama_embedding(self, noopt=1, op_limit=0):
+    # llama3 is 128256
+    vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+    emb = nn.Embedding(vocab_size, embed_size)
+    emb_w = emb.weight.numpy()
+    x = Tensor([1,2,3,4])
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      z = emb(x).realize()
+      self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+      self.assertEqual(GlobalCounters.kernel_count, 2)
+    if getenv("CHECK", 1):
+      import torch
+      with torch.no_grad():
+        torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+        torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+      torch_z = torch_emb(torch.tensor(x.numpy()))
+      # TODO: reshape to match torch, should we do this in nn?
+      np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+  # at least the arange is being fused
+  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1736704000)
+
+if __name__ == "__main__":
+  unittest.main()
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_const_folding.py
@@ -2,14 +2,14 @@ import unittest, math
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.helpers import CI
-from tinygrad.ops import BufferOps
+from tinygrad.ops import MetaOps
 import numpy as np
 from test.helpers import is_dtype_supported
 
 def _check_ast_count(desired_count:int, t:Tensor):
   # NOTE: this has side effect because everything can be scheduled only once
   schedule = create_schedule(t.lazydata.lbs)
-  asts = [s for s in schedule if s.ast[0].op is BufferOps.STORE]
+  asts = [s for s in schedule if s.ast.op is MetaOps.KERNEL]
   assert len(asts) == desired_count
 
 class TestUnaryOpsConstFolding(unittest.TestCase):
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_conv.py
@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):
 
     print(ret.numpy())
 
+  def test_two_binops_no_rerun_small(self):
+    Tensor.no_grad = True
+    x = Tensor.rand(1,1,32,32)
+    w = Tensor.rand(1,1,3,3)
+    out = x.conv2d(w, padding=(1,1))
+    np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+    Tensor.no_grad = False
+
   def test_two_binops_no_rerun(self):
     Tensor.no_grad = True
     x = Tensor.randn(1,12,128,256)
tinygrad-0.9.2/test/test_conv_shapetracker.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+import unittest
+from tinygrad.tensor import Tensor
+from tinygrad.ops import MetaOps, BufferOps
+from tinygrad.nn import Conv2d
+from tinygrad.engine.schedule import create_schedule
+from tinygrad.shape.shapetracker import ShapeTracker, View
+from tinygrad.helpers import prod
+from test.unit.test_shapetracker import shapetracker_getitem
+
+class TestConvShapetracker(unittest.TestCase):
+  def test_conv_3x3_one_view(self):
+    conv = Conv2d(16, 32, (3, 3))
+    seen = set()
+
+    # first run to init the weights, they are saved in seen
+    create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen)
+    # run it again to get the kernels
+    sched = [si for si in create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen) if si.ast.op is MetaOps.KERNEL]
+    assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
+    for st in [x.arg.st for x in sched[0].ast.lazyops if x.op is BufferOps.LOAD]:
+      assert len(st.views) == 1
+
+  @unittest.expectedFailure
+  def test_conv_2x2_backward_one_view(self):
+    X = Tensor.rand(1, 1, 3, 3, requires_grad=True)
+    conv = Conv2d(1, 1, (2, 2), bias=False)
+    conv(X).mean().backward()
+    si = X.grad.schedule()[-1]
+    print(si)
+    ldb = [x for x in si.ast.lazyops if x.op is BufferOps.LOAD][0]
+    st: ShapeTracker = ldb.arg.st.simplify()
+    # NOTE: st.real_size() is broken
+    print(si.inputs[0].size)
+    #self.assertEqual(si.inputs[0].size, st.real_size())
+    for v in st.views: print(v)
+
+    # same st
+    test_st = ShapeTracker((
+      View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+      View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+           mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False)))
+    #test_st = ShapeTracker((
+    #  View(shape=(2,4), strides=(1,4), offset=0, mask=None, contiguous=False),
+    #)).simplify()
+    #View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+    #View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+    #  mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False))).simplify()
+    print("*** new ***")
+    for v in test_st.views: print(v)
+    for i in range(prod(st.shape)):
+      i1, i2 = shapetracker_getitem(st, i), shapetracker_getitem(test_st, i)
+      print(i, i1, i2, si.inputs[0].size, i1==i2)
+      #self.assertEqual(i1, i2)
+
+    for stt in [st, test_st]:
+      s,va = stt.expr_idxs()
+      print(s)
+      print(va)
+    assert len(st.views) <= 2
+
+if __name__ == '__main__':
+  unittest.main()
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_custom_function.py
@@ -31,7 +31,7 @@ def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(
 # NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
 # In general, it is also optional to write a backward function, just your backward pass won't work without it
 
-from tinygrad.ops import LoadOps, BinaryOps, UnaryOps
+from tinygrad.ops import MetaOps, BinaryOps, UnaryOps
 from tinygrad.lazy import LazyBuffer
 from tinygrad.tensor import Function
 
@@ -39,7 +39,7 @@ class ATan2(Function):
   def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
     assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
     self.a, self.b = a, b
-    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), LoadOps.CUSTOM,
+    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), MetaOps.CUSTOM,
                              arg={"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device], srcs=(a.contiguous(), b.contiguous()))
   def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
     recip = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b)).e(UnaryOps.RECIP)
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_device_speed.py
@@ -1,6 +1,6 @@
 import unittest
 from tinygrad import Device
-from tinygrad.codegen.uops import UOpGraph
+from tinygrad.codegen.uopgraph import UOpGraph
 from tinygrad.helpers import Timing, Profiling
 
 class TestDeviceSpeed(unittest.TestCase):
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype.py
@@ -202,7 +202,7 @@ class TestFloatDType(TestDType):
 
 class TestDoubleDType(TestDType):
   DTYPE = dtypes.double
-  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX"), "conversion not supported on CUDACPU and PTX") # TODO: why not?
+  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX"), "conversion not supported on CI CUDA and PTX") # TODO: why not?
   def test_float64_increased_precision(self):
     for func in [
       lambda t: t.exp(),
@@ -267,7 +267,10 @@ class TestInt32DType(TestDType): DTYPE = dtypes.int32
 class TestUint32DType(TestDType): DTYPE = dtypes.uint32
 
 class TestInt64DType(TestDType): DTYPE = dtypes.int64
-class TestUint64DType(TestDType): DTYPE = dtypes.uint64
+class TestUint64DType(TestDType):
+  DTYPE = dtypes.uint64
+  def test_uint64_load(self):
+    assert Tensor(2**64 - 1, dtype=dtypes.uint64).numpy() == 2**64 - 1
 
 class TestBoolDType(TestDType): DTYPE = dtypes.bool
 
@@ -298,7 +301,7 @@ class TestEqStrDType(unittest.TestCase):
   def test_strs(self):
     if PtrDType is None: raise unittest.SkipTest("no PtrDType support")
     self.assertEqual(str(dtypes.imagef((1,2,4))), "dtypes.imagef((1, 2, 4))")
-    self.assertEqual(str(PtrDType(dtypes.float32)), "ptr.dtypes.float")
+    self.assertEqual(str(PtrDType(dtypes.float32)), "PtrDType(dtypes.float)")
 
 class TestHelpers(unittest.TestCase):
   signed_ints = (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64)
@@ -347,6 +350,20 @@ class TestHelpers(unittest.TestCase):
     with self.assertRaises(RuntimeError): dtypes.from_py({})
     with self.assertRaises(RuntimeError): dtypes.from_py(set())
 
+  def test_dtype_range(self):
+    for dt in core_dtypes:
+      if dtypes.is_float(dt):
+        np.testing.assert_equal(dtypes.min(dt), -math.inf)
+        np.testing.assert_equal(dtypes.max(dt), math.inf)
+      elif dtypes.is_int(dt):
+        info = np.iinfo(_to_np_dtype(dt))
+        np.testing.assert_equal(dtypes.min(dt), info.min)
+        np.testing.assert_equal(dtypes.max(dt), info.max)
+      else:
+        assert dt == dtypes.bool, dt
+        np.testing.assert_equal(dtypes.min(dt), False)
+        np.testing.assert_equal(dtypes.max(dt), True)
+
 class TestTypeSpec(unittest.TestCase):
   def setUp(self):
     self.old_default_int, self.old_default_float = dtypes.default_int, dtypes.default_float
@@ -378,6 +395,23 @@ class TestTypeSpec(unittest.TestCase):
     subprocess.run(['DEFAULT_FLOAT=TYPO python3 -c "from tinygrad import dtypes"'],
                    shell=True, check=True)
 
+  def test_dtype_str_arg(self):
+    n = np.random.normal(0, 1, (10, 10)).astype(np.float32)
+    tested = 0
+    for dtype_str, dtype in [
+      ("bool", dtypes.bool), ("int8", dtypes.int8), ("int", dtypes.int), ("uint32", dtypes.uint32), ("float32", dtypes.float32)]:
+      np.testing.assert_equal(Tensor(n, dtype=dtype_str).numpy(), Tensor(n, dtype=dtype).numpy())
+      np.testing.assert_equal(Tensor(n).cast(dtype_str).numpy(), Tensor(n).cast(dtype).numpy())
+      if dtype.itemsize == 4:
+        np.testing.assert_equal(Tensor(n).bitcast(dtype_str).numpy(), Tensor(n).bitcast(dtype).numpy())
+        tested += 1
+    assert tested == 3
+
+    with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="nonexistdtype")
+    with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="")
+
+    np.testing.assert_equal(Tensor(n).sum(acc_dtype="int16").numpy(), Tensor(n).sum(acc_dtype=dtypes.int16).numpy())
+
   @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats))
   def test_creation(self, default_int, default_float):
     dtypes.default_int, dtypes.default_float = default_int, default_float
@@ -439,6 +473,9 @@ class TestTypeSpec(unittest.TestCase):
     _assert_eq(Tensor.arange(5, dtype=dtypes.float16), dtypes.float16, np.arange(5))
     _assert_eq(Tensor.arange(3, 9, 0.7), dtypes.default_float, np.arange(3, 9, 0.7))
     _assert_eq(Tensor.arange(3, 8.5, 3), dtypes.default_float, np.arange(3, 8.5, 3))
+    # stop-start and step have different signs
+    _assert_eq(Tensor.arange(3, 5, -2), dtypes.default_int, np.arange(3, 5, -2))
+    _assert_eq(Tensor.arange(5.0, 3.0), dtypes.default_float, np.arange(5.0, 3.0))
 
   @given(strat.sampled_from(core_dtypes), strat.sampled_from([operator.gt, operator.ge, operator.le, operator.lt, operator.eq, operator.ne]))
   def test_bool_ops(self, dtype, op):
{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype_alu.py
@@ -16,7 +16,7 @@ settings.register_profile("my_profile", max_examples=200, deadline=None, derando
 settings.load_profile("my_profile")
 print(settings.default)
 
-dtypes_float = (dtypes.float32, dtypes.float16)
+dtypes_float = (dtypes.float16, dtypes.float32, dtypes.float64)
 dtypes_int = (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
 dtypes_bool = (dtypes.bool,)
 binary_operations = [operator.add, operator.sub, operator.mul, operator.lt, operator.eq]
@@ -24,9 +24,9 @@ binary_operations = [operator.add, operator.sub, operator.mul, operator.lt, oper
 # TODO: LLVM comparing with nan is incorrect
 if Device.DEFAULT == "LLVM":
   binary_operations.remove(operator.lt)
-  binary_operations.remove(operator.eq)
 
-integer_binary_operations = binary_operations + [(Tensor.xor, np.bitwise_xor)]
+integer_binary_operations = binary_operations + [(Tensor.xor, np.bitwise_xor), (Tensor.bitwise_and, np.bitwise_and),
+                                                 (Tensor.bitwise_or, np.bitwise_or)]
 unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), operator.neg, (Tensor.sin, np.sin),
                     (Tensor.sqrt, np.sqrt), (Tensor.reciprocal, np.reciprocal)]
 
@@ -39,9 +39,8 @@ unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), operator.neg, (T
 # TODO: (a+b)/2 in tensor.py's maximum can overflow. This requires a new implementation of maximum that can be backpropagated
 #binary_operations += [(Tensor.maximum, np.maximum)]
 
-# TODO: CUDACPU segfaults on sin
-# TODO: METAL sin is flaky for float16
-if getenv("CUDACPU") or (getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "METAL": unary_operations.remove((Tensor.sin, np.sin))
+# TODO: CI CUDA segfaults on sin
+if getenv("MOCKGPU") and Device.DEFAULT == "NV": unary_operations.remove((Tensor.sin, np.sin))
 
 class ht:
   float64 = strat.floats(width=64, allow_subnormal=False)
@@ -68,7 +67,7 @@ def universal_test_unary(a, dtype, op):
   if not isinstance(op, tuple): op = (op, op)
   out: Tensor = op[0](Tensor([a], dtype=dtype))
   sched = create_schedule([out.lazydata])
-  ast = sched[-1].ast[0]
+  ast = sched[-1].ast
   run_schedule(sched)
   tensor_value = out.numpy()
   numpy_value = op[1](np.array([a]).astype(_to_np_dtype(dtype)))
@@ -145,8 +144,8 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.int32, ht.int32, ht.float32, strat.sampled_from(integer_binary_operations), strat.sampled_from(binary_operations))
   def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)
 
-  # Metal and CUDACPU and HIP behave differently than numpy in CI for overflows
-  skip_overflow = CI and (Device.DEFAULT in {"AMD", "NV"} or getenv("CUDACPU"))
+  # Metal and CUDA and HIP behave differently than numpy in CI for overflows
+  skip_overflow = CI and Device.DEFAULT in {"AMD", "NV"}
   @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
@@ -161,5 +160,43 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.int32, strat.sampled_from(dtypes_float+dtypes_int+dtypes_bool))
   def test_int32_cast(self, a, dtype): universal_test_cast(a, dtypes.int32, dtype)
 
+class TestFromFuzzer(unittest.TestCase):
+  @given(strat.sampled_from(dtypes_float))
+  def test_sin(self, dtype):
+    if not is_dtype_supported(dtype): return
+    if dtype == dtypes.float64:
+      # crashes in CI CUDA
+      if getenv("MOCKGPU") and Device.DEFAULT == "NV": return
+    def _test_value(n: float, unit: float=1.0):
+      next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
+      ulp = next_float - 1.0
+      ulp = unit * ulp
+      np.testing.assert_allclose(Tensor([n], dtype=dtype).sin().numpy(), np.sin(np.array([n], dtype=_to_np_dtype(dtype))), atol=ulp, rtol=1e-5)
+    _test_value(-35.0)
+    _test_value(-25.0)
+    _test_value(25.0)
+    _test_value(30.0) # 30.0 == switch_over
+    _test_value(35.0)
+    _test_value(0.0)
+    _test_value(np.pi / 2)
+    # worst case of ulp 1.5
+    _test_value(np.pi * 2, unit=1.5)
+  @given(strat.sampled_from(dtypes_float))
+  def test_log2(self, dtype):
+    if not is_dtype_supported(dtype): return
+    if dtype == dtypes.float64:
+      # crashes in CI CUDA
+      if getenv("MOCKGPU") and Device.DEFAULT == "NV": return
+    def _test_value(n: float, unit: float=1.0):
+      next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
+      ulp = next_float - 1.0
+      ulp = unit * ulp
+      np.testing.assert_allclose(Tensor([n], dtype=dtype).log2().numpy(), np.log2(np.array([n], dtype=_to_np_dtype(dtype))), atol=ulp, rtol=1e-5)
+    fmin = np.finfo(_to_np_dtype(dtype)).tiny
+    for scale in [1.0, 1e10, 1e20, 1e30]:
+      _test_value(fmin * scale)
+      _test_value(-fmin * scale)
+    _test_value(0)
+
 if __name__ == '__main__':
   unittest.main()