tinygrad 0.9.1.tar.gz → 0.10.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179)
  1. {tinygrad-0.9.1 → tinygrad-0.10.0}/PKG-INFO +21 -17
  2. {tinygrad-0.9.1 → tinygrad-0.10.0}/README.md +13 -11
  3. {tinygrad-0.9.1 → tinygrad-0.10.0}/setup.py +13 -9
  4. tinygrad-0.10.0/test/test_arange.py +179 -0
  5. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_assign.py +17 -4
  6. tinygrad-0.10.0/test/test_compile_failures.py +18 -0
  7. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_const_folding.py +27 -12
  8. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_conv.py +8 -0
  9. tinygrad-0.10.0/test/test_conv_shapetracker.py +58 -0
  10. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_copy_speed.py +1 -1
  11. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_device_speed.py +1 -2
  12. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_dtype.py +172 -35
  13. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_dtype_alu.py +27 -15
  14. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_fusion_op.py +28 -9
  15. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_fuzz_shape_ops.py +2 -2
  16. tinygrad-0.10.0/test/test_gc.py +67 -0
  17. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_graph.py +1 -2
  18. tinygrad-0.10.0/test/test_hcq.py +475 -0
  19. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_image_dtype.py +51 -11
  20. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_jit.py +79 -2
  21. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_lazybuffer.py +34 -13
  22. tinygrad-0.10.0/test/test_linearizer.py +2174 -0
  23. tinygrad-0.10.0/test/test_linearizer_dumb.py +223 -0
  24. tinygrad-0.10.0/test/test_linearizer_failures.py +1435 -0
  25. tinygrad-0.10.0/test/test_linearizer_overflows.py +196 -0
  26. tinygrad-0.10.0/test/test_metal.py +77 -0
  27. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_multitensor.py +202 -47
  28. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_nn.py +259 -30
  29. tinygrad-0.10.0/test/test_ocl.py +31 -0
  30. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_ops.py +615 -52
  31. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_optim.py +1 -1
  32. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_pickle.py +52 -6
  33. tinygrad-0.10.0/test/test_profiler.py +221 -0
  34. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_randomness.py +146 -28
  35. tinygrad-0.10.0/test/test_rearrange_einops.py +321 -0
  36. tinygrad-0.10.0/test/test_renderer_failures.py +68 -0
  37. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_sample.py +1 -2
  38. tinygrad-0.10.0/test/test_schedule.py +1859 -0
  39. tinygrad-0.10.0/test/test_search.py +158 -0
  40. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_setitem.py +23 -8
  41. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_specific_conv.py +1 -1
  42. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_speed_v_torch.py +9 -16
  43. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_subbuffer.py +2 -3
  44. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_jit.py +1 -3
  45. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_ops.py +2 -2
  46. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_symbolic_shapetracker.py +37 -40
  47. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor.py +182 -11
  48. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor_data.py +12 -1
  49. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_tensor_variable.py +36 -20
  50. tinygrad-0.10.0/test/test_tiny.py +84 -0
  51. tinygrad-0.10.0/test/test_transcendental.py +121 -0
  52. tinygrad-0.10.0/test/test_uop_graph.py +716 -0
  53. tinygrad-0.10.0/test/test_uops.py +454 -0
  54. tinygrad-0.10.0/test/test_uops_stats.py +224 -0
  55. tinygrad-0.10.0/test/test_viz.py +93 -0
  56. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_winograd.py +7 -6
  57. tinygrad-0.10.0/tinygrad/__init__.py +11 -0
  58. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/codegen/kernel.py +308 -175
  59. tinygrad-0.10.0/tinygrad/codegen/linearize.py +95 -0
  60. tinygrad-0.10.0/tinygrad/codegen/lowerer.py +143 -0
  61. tinygrad-0.10.0/tinygrad/codegen/transcendental.py +257 -0
  62. tinygrad-0.10.0/tinygrad/codegen/uopgraph.py +506 -0
  63. tinygrad-0.10.0/tinygrad/device.py +221 -0
  64. tinygrad-0.10.0/tinygrad/dtype.py +188 -0
  65. tinygrad-0.10.0/tinygrad/engine/jit.py +295 -0
  66. {tinygrad-0.9.1/tinygrad → tinygrad-0.10.0/tinygrad/engine}/lazy.py +74 -66
  67. tinygrad-0.10.0/tinygrad/engine/memory.py +51 -0
  68. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/realize.py +86 -61
  69. tinygrad-0.10.0/tinygrad/engine/schedule.py +419 -0
  70. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/search.py +58 -47
  71. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/function.py +59 -58
  72. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/helpers.py +120 -102
  73. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/multi.py +82 -78
  74. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/__init__.py +116 -67
  75. tinygrad-0.10.0/tinygrad/nn/datasets.py +15 -0
  76. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/optim.py +1 -1
  77. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/nn/state.py +91 -6
  78. tinygrad-0.10.0/tinygrad/ops.py +1152 -0
  79. tinygrad-0.10.0/tinygrad/renderer/__init__.py +89 -0
  80. tinygrad-0.10.0/tinygrad/renderer/cstyle.py +462 -0
  81. tinygrad-0.10.0/tinygrad/renderer/llvmir.py +142 -0
  82. tinygrad-0.10.0/tinygrad/renderer/ptx.py +225 -0
  83. tinygrad-0.10.0/tinygrad/runtime/autogen/adreno.py +17904 -0
  84. tinygrad-0.10.0/tinygrad/runtime/autogen/amd_gpu.py +48384 -0
  85. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/cuda.py +6 -162
  86. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/io_uring.py +97 -63
  87. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/kfd.py +60 -47
  88. tinygrad-0.10.0/tinygrad/runtime/autogen/kgsl.py +1386 -0
  89. tinygrad-0.10.0/tinygrad/runtime/autogen/libc.py +5462 -0
  90. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/nv_gpu.py +1976 -1957
  91. tinygrad-0.10.0/tinygrad/runtime/autogen/nvrtc.py +579 -0
  92. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/opencl.py +11 -11
  93. tinygrad-0.10.0/tinygrad/runtime/autogen/qcom_dsp.py +1739 -0
  94. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/clang.py +3 -3
  95. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/cuda.py +11 -15
  96. tinygrad-0.10.0/tinygrad/runtime/graph/hcq.py +200 -0
  97. tinygrad-0.10.0/tinygrad/runtime/graph/metal.py +103 -0
  98. tinygrad-0.10.0/tinygrad/runtime/ops_amd.py +471 -0
  99. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_clang.py +12 -5
  100. tinygrad-0.10.0/tinygrad/runtime/ops_cloud.py +220 -0
  101. tinygrad-0.10.0/tinygrad/runtime/ops_cuda.py +128 -0
  102. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_disk.py +25 -26
  103. tinygrad-0.10.0/tinygrad/runtime/ops_dsp.py +181 -0
  104. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_gpu.py +29 -16
  105. tinygrad-0.10.0/tinygrad/runtime/ops_hip.py +68 -0
  106. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_llvm.py +15 -10
  107. tinygrad-0.10.0/tinygrad/runtime/ops_metal.py +188 -0
  108. tinygrad-0.10.0/tinygrad/runtime/ops_nv.py +584 -0
  109. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_python.py +78 -79
  110. tinygrad-0.10.0/tinygrad/runtime/ops_qcom.py +405 -0
  111. tinygrad-0.10.0/tinygrad/runtime/support/compiler_cuda.py +77 -0
  112. tinygrad-0.9.1/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.10.0/tinygrad/runtime/support/compiler_hip.py +13 -1
  113. tinygrad-0.10.0/tinygrad/runtime/support/elf.py +38 -0
  114. tinygrad-0.10.0/tinygrad/runtime/support/hcq.py +539 -0
  115. tinygrad-0.10.0/tinygrad/shape/__init__.py +0 -0
  116. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/shape/shapetracker.py +40 -50
  117. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/shape/view.py +102 -63
  118. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/tensor.py +1109 -365
  119. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/PKG-INFO +21 -17
  120. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/SOURCES.txt +33 -13
  121. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/requires.txt +6 -6
  122. tinygrad-0.9.1/test/test_arange.py +0 -19
  123. tinygrad-0.9.1/test/test_conv_shapetracker.py +0 -22
  124. tinygrad-0.9.1/test/test_custom_function.py +0 -106
  125. tinygrad-0.9.1/test/test_gc.py +0 -37
  126. tinygrad-0.9.1/test/test_lazyop.py +0 -34
  127. tinygrad-0.9.1/test/test_linearizer.py +0 -1778
  128. tinygrad-0.9.1/test/test_linearizer_failures.py +0 -255
  129. tinygrad-0.9.1/test/test_linearizer_overflows.py +0 -89
  130. tinygrad-0.9.1/test/test_pattern_matcher.py +0 -168
  131. tinygrad-0.9.1/test/test_print_tree.py +0 -66
  132. tinygrad-0.9.1/test/test_schedule.py +0 -1156
  133. tinygrad-0.9.1/test/test_search.py +0 -101
  134. tinygrad-0.9.1/test/test_uop_graph.py +0 -190
  135. tinygrad-0.9.1/test/test_uops.py +0 -319
  136. tinygrad-0.9.1/test/test_uops_stats.py +0 -81
  137. tinygrad-0.9.1/test/test_verify_lazyop.py +0 -64
  138. tinygrad-0.9.1/tinygrad/__init__.py +0 -6
  139. tinygrad-0.9.1/tinygrad/codegen/linearizer.py +0 -528
  140. tinygrad-0.9.1/tinygrad/codegen/uops.py +0 -451
  141. tinygrad-0.9.1/tinygrad/device.py +0 -320
  142. tinygrad-0.9.1/tinygrad/dtype.py +0 -113
  143. tinygrad-0.9.1/tinygrad/engine/graph.py +0 -100
  144. tinygrad-0.9.1/tinygrad/engine/jit.py +0 -198
  145. tinygrad-0.9.1/tinygrad/engine/schedule.py +0 -370
  146. tinygrad-0.9.1/tinygrad/nn/datasets.py +0 -8
  147. tinygrad-0.9.1/tinygrad/ops.py +0 -169
  148. tinygrad-0.9.1/tinygrad/renderer/__init__.py +0 -65
  149. tinygrad-0.9.1/tinygrad/renderer/assembly.py +0 -269
  150. tinygrad-0.9.1/tinygrad/renderer/cstyle.py +0 -389
  151. tinygrad-0.9.1/tinygrad/renderer/llvmir.py +0 -160
  152. tinygrad-0.9.1/tinygrad/runtime/autogen/amd_gpu.py +0 -13403
  153. tinygrad-0.9.1/tinygrad/runtime/graph/hcq.py +0 -187
  154. tinygrad-0.9.1/tinygrad/runtime/graph/metal.py +0 -75
  155. tinygrad-0.9.1/tinygrad/runtime/ops_amd.py +0 -550
  156. tinygrad-0.9.1/tinygrad/runtime/ops_cuda.py +0 -185
  157. tinygrad-0.9.1/tinygrad/runtime/ops_metal.py +0 -105
  158. tinygrad-0.9.1/tinygrad/runtime/ops_nv.py +0 -625
  159. tinygrad-0.9.1/tinygrad/shape/symbolic.py +0 -327
  160. {tinygrad-0.9.1 → tinygrad-0.10.0}/LICENSE +0 -0
  161. {tinygrad-0.9.1 → tinygrad-0.10.0}/setup.cfg +0 -0
  162. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_kernel_cache.py +0 -0
  163. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_masked_st.py +0 -0
  164. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_method_cache.py +0 -0
  165. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_net_speed.py +0 -0
  166. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_to_numpy.py +0 -0
  167. {tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_zero_copy.py +0 -0
  168. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/codegen/__init__.py +0 -0
  169. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/engine/__init__.py +0 -0
  170. /tinygrad-0.9.1/tinygrad/runtime/__init__.py → /tinygrad-0.10.0/tinygrad/py.typed +0 -0
  171. {tinygrad-0.9.1/tinygrad/runtime/driver → tinygrad-0.10.0/tinygrad/runtime}/__init__.py +0 -0
  172. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/comgr.py +0 -0
  173. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/hip.py +0 -0
  174. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/autogen/hsa.py +0 -0
  175. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/graph/__init__.py +0 -0
  176. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad/runtime/ops_npy.py +0 -0
  177. {tinygrad-0.9.1/tinygrad/shape → tinygrad-0.10.0/tinygrad/runtime/support}/__init__.py +0 -0
  178. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/dependency_links.txt +0 -0
  179. {tinygrad-0.9.1 → tinygrad-0.10.0}/tinygrad.egg-info/top_level.txt +0 -0
{tinygrad-0.9.1 → tinygrad-0.10.0}/PKG-INFO

@@ -1,17 +1,14 @@
  Metadata-Version: 2.1
  Name: tinygrad
- Version: 0.9.1
+ Version: 0.10.0
  Summary: You like pytorch? You like micrograd? You love tinygrad! <3
  Author: George Hotz
  License: MIT
  Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: MIT License
- Requires-Python: >=3.8
+ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: numpy
- Requires-Dist: pyobjc-framework-Metal; platform_system == "Darwin"
- Requires-Dist: pyobjc-framework-libdispatch; platform_system == "Darwin"
  Provides-Extra: llvm
  Requires-Dist: llvmlite; extra == "llvm"
  Provides-Extra: arm
@@ -20,12 +17,13 @@ Provides-Extra: triton
  Requires-Dist: triton-nightly>=2.1.0.dev20231014192330; extra == "triton"
  Provides-Extra: linting
  Requires-Dist: pylint; extra == "linting"
- Requires-Dist: mypy; extra == "linting"
+ Requires-Dist: mypy==1.11.2; extra == "linting"
  Requires-Dist: typing-extensions; extra == "linting"
  Requires-Dist: pre-commit; extra == "linting"
  Requires-Dist: ruff; extra == "linting"
  Requires-Dist: types-tqdm; extra == "linting"
  Provides-Extra: testing
+ Requires-Dist: numpy; extra == "testing"
  Requires-Dist: torch; extra == "testing"
  Requires-Dist: pillow; extra == "testing"
  Requires-Dist: pytest; extra == "testing"
@@ -39,17 +37,21 @@ Requires-Dist: safetensors; extra == "testing"
  Requires-Dist: transformers; extra == "testing"
  Requires-Dist: sentencepiece; extra == "testing"
  Requires-Dist: tiktoken; extra == "testing"
+ Requires-Dist: blobfile; extra == "testing"
  Requires-Dist: librosa; extra == "testing"
  Requires-Dist: networkx; extra == "testing"
  Requires-Dist: hypothesis; extra == "testing"
  Requires-Dist: nibabel; extra == "testing"
  Requires-Dist: bottle; extra == "testing"
+ Requires-Dist: ggml-python; extra == "testing"
  Provides-Extra: docs
+ Requires-Dist: mkdocs; extra == "docs"
  Requires-Dist: mkdocs-material; extra == "docs"
  Requires-Dist: mkdocstrings[python]; extra == "docs"
  Requires-Dist: markdown-callouts; extra == "docs"
  Requires-Dist: markdown-exec[ansi]; extra == "docs"
  Requires-Dist: black; extra == "docs"
+ Requires-Dist: numpy; extra == "docs"
  Provides-Extra: testing-tf
  Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
  Requires-Dist: tensorflow_addons; extra == "testing-tf"
@@ -107,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
  As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
  Throw in an optimizer, a data loader, and some compute, and you have all you need.

- ```py
+ ```python
  from tinygrad import Tensor, nn

  class LinearNet:
@@ -122,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

  x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

- for i in range(10):
-   optim.zero_grad()
-   loss = model(x).sparse_categorical_crossentropy(y).backward()
-   optim.step()
-   print(i, loss.item())
+ with Tensor.train():
+   for i in range(10):
+     optim.zero_grad()
+     loss = model(x).sparse_categorical_crossentropy(y).backward()
+     optim.step()
+     print(i, loss.item())
  ```

  See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -142,9 +145,12 @@ tinygrad already supports numerous accelerators, including:
  - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
  - [x] [AMD](tinygrad/runtime/ops_amd.py)
  - [x] [NV](tinygrad/runtime/ops_nv.py)
+ - [x] [QCOM](tinygrad/runtime/ops_qcom.py)

  And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

+ To check default accelerator run: `python3 -c "from tinygrad import Device; print(Device.DEFAULT)"`
+
  ## Installation

  The current recommended way to install tinygrad is from source.
@@ -169,7 +175,7 @@ Documentation along with a quick start guide can be found on the [docs website](

  ### Quick example comparing to PyTorch

- ```py
+ ```python
  from tinygrad import Tensor

  x = Tensor.eye(3, requires_grad=True)
@@ -182,7 +188,7 @@ print(y.grad.numpy()) # dz/dy
  ```

  The same thing but in PyTorch:
- ```py
+ ```python
  import torch

  x = torch.eye(3, requires_grad=True)
@@ -230,6 +236,4 @@ python3 -m pytest test/ # whole test suite

  #### Process replay tests

- [Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) detects changes in the generated kernels of CI tests by comparing them against tinygrad master. If your PR is a refactor or speedup without any expected behavior change, it should include a green process replay pass to get merged.
-
- You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+ [Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/README.md) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, it should include [pr] in the pull request title.
{tinygrad-0.9.1 → tinygrad-0.10.0}/README.md

@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
  As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
  Throw in an optimizer, a data loader, and some compute, and you have all you need.

- ```py
+ ```python
  from tinygrad import Tensor, nn

  class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

  x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

- for i in range(10):
-   optim.zero_grad()
-   loss = model(x).sparse_categorical_crossentropy(y).backward()
-   optim.step()
-   print(i, loss.item())
+ with Tensor.train():
+   for i in range(10):
+     optim.zero_grad()
+     loss = model(x).sparse_categorical_crossentropy(y).backward()
+     optim.step()
+     print(i, loss.item())
  ```

  See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -86,9 +87,12 @@ tinygrad already supports numerous accelerators, including:
  - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
  - [x] [AMD](tinygrad/runtime/ops_amd.py)
  - [x] [NV](tinygrad/runtime/ops_nv.py)
+ - [x] [QCOM](tinygrad/runtime/ops_qcom.py)

  And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

+ To check default accelerator run: `python3 -c "from tinygrad import Device; print(Device.DEFAULT)"`
+
  ## Installation

  The current recommended way to install tinygrad is from source.
@@ -113,7 +117,7 @@ Documentation along with a quick start guide can be found on the [docs website](

  ### Quick example comparing to PyTorch

- ```py
+ ```python
  from tinygrad import Tensor

  x = Tensor.eye(3, requires_grad=True)
@@ -126,7 +130,7 @@ print(y.grad.numpy()) # dz/dy
  ```

  The same thing but in PyTorch:
- ```py
+ ```python
  import torch

  x = torch.eye(3, requires_grad=True)
@@ -174,6 +178,4 @@ python3 -m pytest test/ # whole test suite

  #### Process replay tests

- [Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) detects changes in the generated kernels of CI tests by comparing them against tinygrad master. If your PR is a refactor or speedup without any expected behavior change, it should include a green process replay pass to get merged.
-
- You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+ [Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/README.md) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, it should include [pr] in the pull request title.
{tinygrad-0.9.1 → tinygrad-0.10.0}/setup.py

@@ -8,36 +8,36 @@ with open(directory / 'README.md', encoding='utf-8') as f:
    long_description = f.read()

  setup(name='tinygrad',
-       version='0.9.1',
+       version='0.10.0',
        description='You like pytorch? You like micrograd? You love tinygrad! <3',
        author='George Hotz',
        license='MIT',
        long_description=long_description,
        long_description_content_type='text/markdown',
        packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                   'tinygrad.runtime', 'tinygrad.runtime.driver', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+                   'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+       package_data = {'tinygrad': ['py.typed']},
        classifiers=[
          "Programming Language :: Python :: 3",
          "License :: OSI Approved :: MIT License"
        ],
-       install_requires=["numpy",
-                         "pyobjc-framework-Metal; platform_system=='Darwin'",
-                         "pyobjc-framework-libdispatch; platform_system=='Darwin'"],
-       python_requires='>=3.8',
+       install_requires=[],
+       python_requires='>=3.10',
        extras_require={
          'llvm': ["llvmlite"],
          'arm': ["unicorn"],
          'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
          'linting': [
              "pylint",
-             "mypy",
+             "mypy==1.11.2",
              "typing-extensions",
              "pre-commit",
              "ruff",
              "types-tqdm",
          ],
-         #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.0.0-rc2"],
+         #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.1.0-rc3"],
          'testing': [
+             "numpy",
              "torch",
              "pillow",
              "pytest",
@@ -51,18 +51,22 @@ setup(name='tinygrad',
              "transformers",
              "sentencepiece",
              "tiktoken",
+             "blobfile",
              "librosa",
              "networkx",
              "hypothesis",
              "nibabel",
              "bottle",
+             "ggml-python"
          ],
          'docs': [
+             "mkdocs",
              "mkdocs-material",
              "mkdocstrings[python]",
              "markdown-callouts",
              "markdown-exec[ansi]",
-             "black"
+             "black",
+             "numpy",
          ],
          'testing_tf': [
              "tensorflow==2.15.1",
tinygrad-0.10.0/test/test_arange.py

@@ -0,0 +1,179 @@
+ import unittest, contextlib
+ import numpy as np
+ from tinygrad import Tensor, GlobalCounters, dtypes, nn
+ from tinygrad.helpers import CI, Context, getenv
+ from tinygrad.engine.realize import run_schedule
+ from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+ from tinygrad.engine.realize import CompiledRunner, ExecItem
+ from tinygrad.engine.search import get_kernel_actions
+
+ class TestArange(unittest.TestCase):
+   def _get_flops(self, N, opts=None):
+     GlobalCounters.reset()
+     tt = Tensor.arange(N)
+     sched = tt.schedule()
+     self.assertEqual(len(sched), 1)
+     k = Kernel(sched[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     p = k.to_program()
+     print(p.name)
+     #print(p.src)
+     ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+     np.testing.assert_equal(tt.numpy(), np.arange(N))
+     return p.op_estimate
+
+   def test_complexity(self, opts=None, limit=None):
+     # add 1 to avoid divide by 0. arange is 0 flops now!
+     f1 = self._get_flops(256, opts) + 1
+     f2 = self._get_flops(2560, opts) + 1
+     print(f"{f1=}, {f2=}")
+     assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+     if limit is not None and not getenv("PTX"):
+       # PTX counts index ALU in flops
+       assert f1 <= limit, f"{f1=}, {limit=}"
+
+   def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=1)
+   def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=1)
+   def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=1)
+   def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=1)
+   def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=1)
+
+   @unittest.skip("doesn't work yet")
+   def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+   def test_all_opts(self, opts=None, exclude=None):
+     k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+     k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+     all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+     for opts in all_opts:
+       if exclude is not None and opts[-1] in exclude: continue
+       print(opts)
+       self.test_complexity(opts)
+   def test_all_opts_w_local(self):
+     with contextlib.suppress(KernelOptError):
+       return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+   def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+   def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+   def test_all_opts_w_upcast_and_unroll(self):
+     return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+ class TestIndexing(unittest.TestCase):
+   def test_arange_2_reduce(self):
+     needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+     needle[1337] = 1
+     needle.realize()
+     with Context(NOOPT=1, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       # TODO: it should work without these reshapes
+       out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+       sched = out.schedule()
+       assert len(sched) == 1
+       run_schedule(sched)
+     assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_manual_index(self):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=1, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+       idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+       reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+       full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+       X = full.sum(axis=(2,3))
+       sched = X.schedule()
+       assert len(sched) == 1
+       run_schedule(sched)
+       assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+     np.testing.assert_allclose(real_index, X.numpy())
+
+   def test_index(self):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=1):
+       GlobalCounters.reset()
+       X = dataset[idxs]
+       assert X.shape == (4,256)
+       sched = X.schedule()
+       # TODO: enable these asserts when the scheduler can handle this
+       #assert len(sched) == 1, f"{len(sched)} != 1"
+       run_schedule(sched)
+       #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+     np.testing.assert_allclose(real_index, X.numpy())
+
+   def test_index_fused(self, noopt=1):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=noopt, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       X = dataset[idxs]
+       assert X.shape == (4,256)
+       sched = X.schedule()
+       assert len(sched) == 2
+       run_schedule(sched)
+       assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+     np.testing.assert_allclose(real_index, X.numpy())
+   @unittest.skip("not ready")
+   def test_index_fused_opt(self): self.test_index_fused(0)
+
+   def test_index_fused_out_of_bounds(self):
+     dataset = Tensor.rand(256, 256).realize()
+     idxs = Tensor([-19238, -257, 256, 495, 10982377]).realize()
+     with Context(NOOPT=1, FUSE_ARANGE=1):
+       X = dataset[idxs]
+     np.testing.assert_equal(X.numpy(), 0)
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_index_mnist(self, noopt=1, op_limit=512*784*5):
+     from tinygrad.nn.datasets import mnist
+     X_train, Y_train, _, _ = mnist()
+     with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+       samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0]).realize()
+       GlobalCounters.reset()
+       x = X_train[samples].numpy()
+       y = Y_train[samples].numpy()
+       assert GlobalCounters.global_ops < op_limit, f"too many ops {GlobalCounters.global_ops} != {op_limit}"
+     np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+     np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+   @unittest.skip("not ready")
+   def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_llama_embedding(self, noopt=1, op_limit=65536):
+     # llama3 is 128256
+     vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+     emb = nn.Embedding(vocab_size, embed_size)
+     emb_w = emb.weight.numpy()
+     x = Tensor([1,2,3,4])
+     with Context(NOOPT=noopt, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       z = emb(x).realize()
+       self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+       self.assertEqual(GlobalCounters.kernel_count, 2)
+     if getenv("CHECK", 1):
+       import torch
+       with torch.no_grad():
+         torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+         torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+       torch_z = torch_emb(torch.tensor(x.numpy()))
+       # TODO: reshape to match torch, should we do this in nn?
+       np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+   # at least the arange is being fused
+   def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000 if CI else 5_898_240_000)
+
+ if __name__ == "__main__":
+   unittest.main()
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_assign.py

@@ -2,6 +2,7 @@
  import unittest
  import numpy as np
  from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
+ from tinygrad.engine.schedule import create_schedule

  N = 200 # has to be bigger than the cache to fail

@@ -57,10 +58,12 @@ class TestAssign(unittest.TestCase):
      x.realize()
      x = Tensor([0])
      f(x)
-     assert (out:=x.item()) == 1, f"expected 1, got {out}"
+     out = x.item()
+     assert out == 1, f"expected 1, got {out}"
      x = Tensor([0])
      f(x)
-     assert (out:=x.item()) == 1, f"expected 1, got {out}"
+     out = x.item()
+     assert out == 1, f"expected 1, got {out}"

    def test_assign_add_jit(self):
      @TinyJit
@@ -165,6 +168,16 @@ class TestAssign(unittest.TestCase):
      a += 1
      np.testing.assert_allclose(a.numpy(), 3)

+   # NOTE: this is similar to the resnet failure
+   #@unittest.expectedFailure
+   def test_double_assign_alt(self):
+     a = Tensor.ones(4).contiguous().realize()
+     b = Tensor([1, 2, 3, 4]).realize().lazydata
+     a1 = a.lazydata.assign(b)
+     a2 = a.lazydata.assign(b)
+     sched = create_schedule([a1, a2])
+     self.assertEqual(len(sched), 1)
+
    def test_crossover_assign(self):
      a = Tensor.full((4,), 2).contiguous().realize()
      b = Tensor.full((4,), 3).contiguous().realize()
@@ -347,7 +360,7 @@

    def test_permuted_assignment_masked_view_possible(self):
      a = Tensor.ones(4, 4).contiguous().realize()
-     b = a.shrink((None, (0, 2))).pad((None, (0, 2)), 2)
+     b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2)
      a.assign(a + b)
      kc = GlobalCounters.kernel_count
      a.realize()
@@ -357,7 +370,7 @@
    def test_permuted_assignment_masked_view_not_contiguous(self):
      a = Tensor.ones(4, 4).contiguous().realize()
      with self.assertRaisesRegex(RuntimeError, "contiguous"):
-       b = a.shrink((None, (0, 2))).pad((None, (0, 2)), 2).permute(1, 0)
+       b = a.shrink((None, (0, 2))).pad((None, (0, 2)), value=2).permute(1, 0)
        a.assign(a + b)
        a.realize()

tinygrad-0.10.0/test/test_compile_failures.py

@@ -0,0 +1,18 @@
+ import unittest
+ from tinygrad import Tensor, dtypes, Device
+ from tinygrad.engine.realize import lower_schedule
+ from tinygrad.device import is_dtype_supported
+
+ class TestCompileFailures(unittest.TestCase):
+   def compile(self, out:Tensor):
+     for _ in lower_schedule(out.schedule()): pass
+
+   @unittest.skipUnless(is_dtype_supported(dtypes.uchar, Device.DEFAULT), f"no uint8 on {Device.DEFAULT}")
+   def test_interpolate_atari(self):
+     self.compile(Tensor.empty(210, 160, dtype='uint8').interpolate((64, 64)))
+
+   def test_add_max_uchar(self):
+     self.compile((Tensor.empty(1024, dtype='uint8') + Tensor.empty(1024, dtype='uint8')).max())
+
+ if __name__ == '__main__':
+   unittest.main()
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_const_folding.py

@@ -1,15 +1,15 @@
  import unittest, math
  from tinygrad import Tensor, Device, dtypes
+ from tinygrad.ops import Ops
  from tinygrad.engine.schedule import create_schedule
  from tinygrad.helpers import CI
- from tinygrad.ops import BufferOps
  import numpy as np
- from test.helpers import is_dtype_supported
+ from tinygrad.device import is_dtype_supported

  def _check_ast_count(desired_count:int, t:Tensor):
    # NOTE: this has side effect because everything can be scheduled only once
    schedule = create_schedule(t.lazydata.lbs)
-   asts = [s for s in schedule if s.ast[0].op is BufferOps.STORE]
+   asts = [s for s in schedule if s.ast.op is Ops.SINK]
    assert len(asts) == desired_count

  class TestUnaryOpsConstFolding(unittest.TestCase):
@@ -23,6 +23,7 @@ class TestUnaryOpsConstFolding(unittest.TestCase):
      _check_ast_count(0, Tensor.ones(4).cast(dtypes.int16))
      _check_ast_count(0, Tensor.full(4, fill_value=-1).cast(dtypes.uint16))

+   @unittest.expectedFailure # no two level fold at lazybuffer
    def test_neg_folding(self):
      _check_ast_count(0, Tensor([1, 2, 3]).mul(-1).neg())
      _check_ast_count(0, Tensor([1, 2, 3]).neg().mul(-1))
@@ -78,6 +79,11 @@ class TestBinaryOpsConstFolding(unittest.TestCase):
    def test_div_tensor_one(self):
      _check_ast_count(0, Tensor([1.0, 2, 3, 4]) / Tensor.ones(4))

+   def test_idiv_literal_one(self):
+     _check_ast_count(0, Tensor([1, 2, 3, 4]) // 1)
+   def test_idiv_tensor_one(self):
+     _check_ast_count(0, Tensor([1, 2, 3, 4]) // Tensor.ones(4, dtype=dtypes.int32))
+
    def test_pow_literal_zero(self):
      _check_ast_count(0, Tensor([1.0, 2, 3, 4]) ** 0)
    def test_pow_tensor_zero(self):
@@ -124,13 +130,16 @@ class TestMovedConstFolding(unittest.TestCase):

    def test_cast_padded(self):
      # NOTE: this is folded due to CAST_BEFORE_VIEW
-     _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
-     np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
-     _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
-     np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
+     if is_dtype_supported(dtypes.int16):
+       _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
+       np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
+     if is_dtype_supported(dtypes.uint16):
+       _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
+       np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
      # not folded
-     _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64))
-     np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0])
+     if is_dtype_supported(dtypes.int64):
+       _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64))
+       np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int64).numpy(), [0, 1, 1, 1, 1, 0])

  class TestReduceOpsConstFolding(unittest.TestCase):
    def test_const_sum(self):
@@ -145,10 +154,18 @@ class TestReduceOpsConstFolding(unittest.TestCase):
      _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).sum())
      np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).sum().numpy(), 4)

-     # NOTE: cannot just count the non-padded area because some UnaryOps f do not have f(0) = 0.
+     # NOTE: cannot just count the non-padded area because some Ops f do not have f(0) = 0.
      _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).exp().sum())
      np.testing.assert_allclose(Tensor.ones(4).pad(((1, 1),)).exp().sum().numpy(), 4 * math.e + 2)

+   def test_const_prod(self):
+     _check_ast_count(0, Tensor.full((2, 3), fill_value=2).prod())
+     np.testing.assert_equal(Tensor.full((2, 3), fill_value=2).prod().numpy(), 2**(2*3))
+     _check_ast_count(0, Tensor.full((4, 5, 6), fill_value=2).prod(axis=0))
+     np.testing.assert_equal(Tensor.full((4, 5, 6), fill_value=2).prod(axis=0).numpy(), np.full((5, 6), 2**4))
+     _check_ast_count(0, Tensor(4).prod())
+     np.testing.assert_equal(Tensor(4).prod().numpy(), 4)
+
    def test_const_max(self):
      _check_ast_count(0, Tensor.ones(4, 5, 6).max())
      np.testing.assert_equal(Tensor.ones(4, 5, 6).max().numpy(), 1)
@@ -234,7 +251,6 @@ class TestTautologicalCompare(unittest.TestCase):
      np.testing.assert_equal((Tensor(True) < Tensor(False)).numpy(), False)
      np.testing.assert_equal((Tensor(True) < Tensor(True)).numpy(), False)

-   @unittest.skip("not implemented yet")
    def test_a_eq_a(self):
      # self eq is always true for int or bool
      a = Tensor([1, 2, 3])
@@ -244,7 +260,6 @@
      a = Tensor([math.nan, 1.0, 2.0])
      np.testing.assert_equal((a == a).numpy(), [False, True, True])

-   @unittest.skip("not implemented yet")
    def test_a_ne_a(self):
      # self not eq is always false for int or bool
      a = Tensor([1, 2, 3])
{tinygrad-0.9.1 → tinygrad-0.10.0}/test/test_conv.py

@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):

      print(ret.numpy())

+   def test_two_binops_no_rerun_small(self):
+     Tensor.no_grad = True
+     x = Tensor.rand(1,1,32,32)
+     w = Tensor.rand(1,1,3,3)
+     out = x.conv2d(w, padding=(1,1))
+     np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+     Tensor.no_grad = False
+
    def test_two_binops_no_rerun(self):
      Tensor.no_grad = True
      x = Tensor.randn(1,12,128,256)