tinygrad 0.10.0.tar.gz → 0.10.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (186)
  1. {tinygrad-0.10.0 → tinygrad-0.10.2}/PKG-INFO +36 -13
  2. {tinygrad-0.10.0 → tinygrad-0.10.2}/README.md +7 -6
  3. {tinygrad-0.10.0 → tinygrad-0.10.2}/setup.py +23 -14
  4. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_arange.py +18 -17
  5. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_assign.py +18 -11
  6. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_const_folding.py +80 -10
  7. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_conv_shapetracker.py +5 -8
  8. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_copy_speed.py +5 -5
  9. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_device_speed.py +1 -1
  10. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_dtype.py +55 -17
  11. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_dtype_alu.py +41 -9
  12. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_fusion_op.py +8 -9
  13. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_fuzz_shape_ops.py +1 -1
  14. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_gc.py +24 -8
  15. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_graph.py +2 -3
  16. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_hcq.py +105 -77
  17. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_image_dtype.py +63 -8
  18. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_jit.py +102 -3
  19. tinygrad-0.10.2/test/test_kernel_cache.py +29 -0
  20. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer.py +209 -180
  21. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_dumb.py +9 -7
  22. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_failures.py +99 -119
  23. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_overflows.py +11 -11
  24. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_metal.py +3 -5
  25. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_multitensor.py +171 -53
  26. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_net_speed.py +1 -1
  27. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_nn.py +48 -36
  28. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_ops.py +520 -95
  29. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_optim.py +1 -1
  30. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_pickle.py +60 -6
  31. tinygrad-0.10.2/test/test_profiler.py +163 -0
  32. tinygrad-0.10.2/test/test_quantize_onnx.py +212 -0
  33. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_randomness.py +4 -3
  34. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_renderer_failures.py +17 -9
  35. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_sample.py +2 -1
  36. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_schedule.py +944 -183
  37. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_search.py +43 -11
  38. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_setitem.py +24 -9
  39. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_speed_v_torch.py +10 -3
  40. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_subbuffer.py +20 -3
  41. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_ops.py +0 -2
  42. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_shapetracker.py +12 -1
  43. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor.py +142 -58
  44. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor_data.py +14 -0
  45. tinygrad-0.10.0/test/test_lazybuffer.py → tinygrad-0.10.2/test/test_tensor_uop.py +28 -45
  46. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tiny.py +44 -10
  47. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_transcendental.py +19 -8
  48. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uop_graph.py +128 -67
  49. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uops.py +229 -61
  50. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uops_stats.py +32 -18
  51. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_winograd.py +11 -4
  52. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_zero_copy.py +1 -1
  53. tinygrad-0.10.2/tinygrad/codegen/devectorizer.py +247 -0
  54. tinygrad-0.10.2/tinygrad/codegen/expander.py +121 -0
  55. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/kernel.py +141 -201
  56. tinygrad-0.10.2/tinygrad/codegen/linearize.py +234 -0
  57. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/lowerer.py +60 -42
  58. tinygrad-0.10.2/tinygrad/codegen/symbolic.py +476 -0
  59. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/transcendental.py +22 -13
  60. tinygrad-0.10.2/tinygrad/device.py +361 -0
  61. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/dtype.py +39 -28
  62. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/jit.py +83 -65
  63. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/memory.py +4 -5
  64. tinygrad-0.10.2/tinygrad/engine/multi.py +161 -0
  65. tinygrad-0.10.2/tinygrad/engine/realize.py +171 -0
  66. tinygrad-0.10.2/tinygrad/engine/schedule.py +458 -0
  67. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/search.py +55 -66
  68. tinygrad-0.10.2/tinygrad/gradient.py +73 -0
  69. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/helpers.py +81 -59
  70. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/__init__.py +30 -32
  71. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/datasets.py +1 -2
  72. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/optim.py +22 -26
  73. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/state.py +91 -66
  74. tinygrad-0.10.2/tinygrad/ops.py +1003 -0
  75. tinygrad-0.10.2/tinygrad/renderer/__init__.py +148 -0
  76. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/cstyle.py +99 -92
  77. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/llvmir.py +83 -34
  78. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/ptx.py +83 -99
  79. tinygrad-0.10.2/tinygrad/renderer/wgsl.py +95 -0
  80. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  81. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/comgr.py +2 -0
  82. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/kfd.py +4 -3
  83. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/kgsl.py +1 -1
  84. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/libc.py +404 -71
  85. tinygrad-0.10.2/tinygrad/runtime/autogen/llvm.py +11379 -0
  86. tinygrad-0.10.2/tinygrad/runtime/autogen/pci.py +1333 -0
  87. tinygrad-0.10.2/tinygrad/runtime/autogen/vfio.py +891 -0
  88. tinygrad-0.10.2/tinygrad/runtime/autogen/webgpu.py +6985 -0
  89. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/graph/cuda.py +8 -9
  90. tinygrad-0.10.2/tinygrad/runtime/graph/hcq.py +205 -0
  91. tinygrad-0.10.2/tinygrad/runtime/graph/metal.py +100 -0
  92. tinygrad-0.10.2/tinygrad/runtime/ops_amd.py +635 -0
  93. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_cloud.py +34 -34
  94. tinygrad-0.10.2/tinygrad/runtime/ops_cpu.py +24 -0
  95. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_cuda.py +30 -27
  96. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_disk.py +62 -63
  97. tinygrad-0.10.2/tinygrad/runtime/ops_dsp.py +298 -0
  98. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_gpu.py +30 -30
  99. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_hip.py +29 -31
  100. tinygrad-0.10.2/tinygrad/runtime/ops_llvm.py +58 -0
  101. tinygrad-0.10.2/tinygrad/runtime/ops_metal.py +224 -0
  102. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_npy.py +2 -2
  103. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_nv.py +238 -273
  104. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_python.py +55 -50
  105. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_qcom.py +129 -157
  106. tinygrad-0.10.2/tinygrad/runtime/ops_webgpu.py +225 -0
  107. tinygrad-0.10.2/tinygrad/runtime/support/allocator.py +94 -0
  108. tinygrad-0.10.2/tinygrad/runtime/support/am/amdev.py +396 -0
  109. tinygrad-0.10.2/tinygrad/runtime/support/am/ip.py +463 -0
  110. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/compiler_cuda.py +4 -2
  111. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/elf.py +28 -4
  112. tinygrad-0.10.2/tinygrad/runtime/support/hcq.py +471 -0
  113. tinygrad-0.10.2/tinygrad/runtime/support/llvm.py +26 -0
  114. tinygrad-0.10.2/tinygrad/shape/__init__.py +0 -0
  115. tinygrad-0.10.2/tinygrad/shape/shapetracker.py +143 -0
  116. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/shape/view.py +104 -140
  117. tinygrad-0.10.2/tinygrad/spec.py +155 -0
  118. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/tensor.py +835 -527
  119. tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  120. tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  121. tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  122. tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  123. tinygrad-0.10.2/tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  124. tinygrad-0.10.2/tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  125. tinygrad-0.10.2/tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  126. tinygrad-0.10.2/tinygrad/viz/index.html +544 -0
  127. tinygrad-0.10.2/tinygrad/viz/perfetto.html +178 -0
  128. tinygrad-0.10.2/tinygrad/viz/serve.py +205 -0
  129. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/PKG-INFO +36 -13
  130. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/SOURCES.txt +31 -9
  131. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/requires.txt +21 -6
  132. tinygrad-0.10.0/test/test_kernel_cache.py +0 -27
  133. tinygrad-0.10.0/test/test_profiler.py +0 -221
  134. tinygrad-0.10.0/test/test_viz.py +0 -93
  135. tinygrad-0.10.0/tinygrad/codegen/linearize.py +0 -95
  136. tinygrad-0.10.0/tinygrad/codegen/uopgraph.py +0 -506
  137. tinygrad-0.10.0/tinygrad/device.py +0 -221
  138. tinygrad-0.10.0/tinygrad/engine/lazy.py +0 -228
  139. tinygrad-0.10.0/tinygrad/engine/realize.py +0 -217
  140. tinygrad-0.10.0/tinygrad/engine/schedule.py +0 -419
  141. tinygrad-0.10.0/tinygrad/function.py +0 -212
  142. tinygrad-0.10.0/tinygrad/multi.py +0 -177
  143. tinygrad-0.10.0/tinygrad/ops.py +0 -1152
  144. tinygrad-0.10.0/tinygrad/renderer/__init__.py +0 -89
  145. tinygrad-0.10.0/tinygrad/runtime/graph/clang.py +0 -39
  146. tinygrad-0.10.0/tinygrad/runtime/graph/hcq.py +0 -200
  147. tinygrad-0.10.0/tinygrad/runtime/graph/metal.py +0 -103
  148. tinygrad-0.10.0/tinygrad/runtime/ops_amd.py +0 -471
  149. tinygrad-0.10.0/tinygrad/runtime/ops_clang.py +0 -35
  150. tinygrad-0.10.0/tinygrad/runtime/ops_dsp.py +0 -181
  151. tinygrad-0.10.0/tinygrad/runtime/ops_llvm.py +0 -51
  152. tinygrad-0.10.0/tinygrad/runtime/ops_metal.py +0 -188
  153. tinygrad-0.10.0/tinygrad/runtime/support/hcq.py +0 -539
  154. tinygrad-0.10.0/tinygrad/shape/shapetracker.py +0 -111
  155. {tinygrad-0.10.0 → tinygrad-0.10.2}/LICENSE +0 -0
  156. {tinygrad-0.10.0 → tinygrad-0.10.2}/setup.cfg +0 -0
  157. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_compile_failures.py +0 -0
  158. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_conv.py +0 -0
  159. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_masked_st.py +0 -0
  160. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_method_cache.py +0 -0
  161. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_ocl.py +0 -0
  162. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_rearrange_einops.py +0 -0
  163. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_specific_conv.py +0 -0
  164. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_jit.py +0 -0
  165. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor_variable.py +0 -0
  166. {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_to_numpy.py +0 -0
  167. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/__init__.py +0 -0
  168. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/__init__.py +0 -0
  169. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/__init__.py +0 -0
  170. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/py.typed +0 -0
  171. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/__init__.py +0 -0
  172. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/adreno.py +0 -0
  173. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/cuda.py +0 -0
  174. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/hip.py +0 -0
  175. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/hsa.py +0 -0
  176. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/io_uring.py +0 -0
  177. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/nv_gpu.py +0 -0
  178. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/nvrtc.py +0 -0
  179. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/opencl.py +0 -0
  180. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/qcom_dsp.py +0 -0
  181. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/graph/__init__.py +0 -0
  182. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/__init__.py +0 -0
  183. {tinygrad-0.10.0/tinygrad/shape → tinygrad-0.10.2/tinygrad/runtime/support/am}/__init__.py +0 -0
  184. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/compiler_hip.py +0 -0
  185. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/dependency_links.txt +0 -0
  186. {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/top_level.txt +0 -0

{tinygrad-0.10.0 → tinygrad-0.10.2}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: tinygrad
- Version: 0.10.0
+ Version: 0.10.2
  Summary: You like pytorch? You like micrograd? You love tinygrad! <3
  Author: George Hotz
  License: MIT
@@ -9,25 +9,39 @@ Classifier: License :: OSI Approved :: MIT License
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Provides-Extra: llvm
- Requires-Dist: llvmlite; extra == "llvm"
  Provides-Extra: arm
  Requires-Dist: unicorn; extra == "arm"
  Provides-Extra: triton
  Requires-Dist: triton-nightly>=2.1.0.dev20231014192330; extra == "triton"
  Provides-Extra: linting
  Requires-Dist: pylint; extra == "linting"
- Requires-Dist: mypy==1.11.2; extra == "linting"
+ Requires-Dist: mypy==1.13.0; extra == "linting"
  Requires-Dist: typing-extensions; extra == "linting"
  Requires-Dist: pre-commit; extra == "linting"
  Requires-Dist: ruff; extra == "linting"
  Requires-Dist: types-tqdm; extra == "linting"
+ Provides-Extra: testing-minimal
+ Requires-Dist: numpy; extra == "testing-minimal"
+ Requires-Dist: torch; extra == "testing-minimal"
+ Requires-Dist: pytest; extra == "testing-minimal"
+ Requires-Dist: pytest-xdist; extra == "testing-minimal"
+ Requires-Dist: hypothesis; extra == "testing-minimal"
+ Provides-Extra: testing-unit
+ Requires-Dist: numpy; extra == "testing-unit"
+ Requires-Dist: torch; extra == "testing-unit"
+ Requires-Dist: pytest; extra == "testing-unit"
+ Requires-Dist: pytest-xdist; extra == "testing-unit"
+ Requires-Dist: hypothesis; extra == "testing-unit"
+ Requires-Dist: tqdm; extra == "testing-unit"
+ Requires-Dist: safetensors; extra == "testing-unit"
+ Requires-Dist: tabulate; extra == "testing-unit"
  Provides-Extra: testing
  Requires-Dist: numpy; extra == "testing"
  Requires-Dist: torch; extra == "testing"
- Requires-Dist: pillow; extra == "testing"
  Requires-Dist: pytest; extra == "testing"
  Requires-Dist: pytest-xdist; extra == "testing"
+ Requires-Dist: hypothesis; extra == "testing"
+ Requires-Dist: pillow; extra == "testing"
  Requires-Dist: onnx==1.16.0; extra == "testing"
  Requires-Dist: onnx2torch; extra == "testing"
  Requires-Dist: opencv-python; extra == "testing"
@@ -40,10 +54,10 @@ Requires-Dist: tiktoken; extra == "testing"
  Requires-Dist: blobfile; extra == "testing"
  Requires-Dist: librosa; extra == "testing"
  Requires-Dist: networkx; extra == "testing"
- Requires-Dist: hypothesis; extra == "testing"
  Requires-Dist: nibabel; extra == "testing"
  Requires-Dist: bottle; extra == "testing"
  Requires-Dist: ggml-python; extra == "testing"
+ Requires-Dist: capstone; extra == "testing"
  Provides-Extra: docs
  Requires-Dist: mkdocs; extra == "docs"
  Requires-Dist: mkdocs-material; extra == "docs"
@@ -55,6 +69,14 @@ Requires-Dist: numpy; extra == "docs"
  Provides-Extra: testing-tf
  Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
  Requires-Dist: tensorflow_addons; extra == "testing-tf"
+ Dynamic: author
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: license
+ Dynamic: provides-extra
+ Dynamic: requires-python
+ Dynamic: summary

  <div align="center">

@@ -139,13 +161,14 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers
  tinygrad already supports numerous accelerators, including:

  - [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
- - [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py)
+ - [x] [CPU (C Code)](tinygrad/runtime/ops_cpu.py)
  - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
  - [x] [METAL](tinygrad/runtime/ops_metal.py)
  - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
  - [x] [AMD](tinygrad/runtime/ops_amd.py)
  - [x] [NV](tinygrad/runtime/ops_nv.py)
  - [x] [QCOM](tinygrad/runtime/ops_qcom.py)
+ - [x] [WEBGPU](tinygrad/runtime/ops_webgpu.py)

  And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -183,8 +206,8 @@ y = Tensor([[2.0,0,-2.0]], requires_grad=True)
  z = y.matmul(x).sum()
  z.backward()

- print(x.grad.numpy()) # dz/dx
- print(y.grad.numpy()) # dz/dy
+ print(x.grad.tolist()) # dz/dx
+ print(y.grad.tolist()) # dz/dy
  ```

  The same thing but in PyTorch:
@@ -196,8 +219,8 @@ y = torch.tensor([[2.0,0,-2.0]], requires_grad=True)
  z = y.matmul(x).sum()
  z.backward()

- print(x.grad.numpy()) # dz/dx
- print(y.grad.numpy()) # dz/dy
+ print(x.grad.tolist()) # dz/dx
+ print(y.grad.tolist()) # dz/dy
  ```

  ## Contributing
@@ -208,7 +231,7 @@ We'll start with what will get your PR closed with a pointer to this section:

  - No code golf! While low line count is a guiding light of this project, anything that remotely looks like code golf will be closed. The true goal is reducing complexity and increasing readability, and deleting `\n`s does nothing to help with that.
  - All docs and whitespace changes will be closed unless you are a well-known contributor. The people writing the docs should be those who know the codebase the absolute best. People who have not demonstrated that shouldn't be messing with docs. Whitespace changes are both useless *and* carry a risk of introducing bugs.
- - Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainablity and readablity.
+ - Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainability and readability.
  - In general, the code outside the core `tinygrad/` folder is not well tested, so unless the current code there is broken, you shouldn't be changing it.
  - If your PR looks "complex", is a big diff, or adds lots of lines, it won't be reviewed or merged. Consider breaking it up into smaller PRs that are individually clear wins. A common pattern I see is prerequisite refactors before adding new functionality. If you can (cleanly) refactor to the point that the feature is a 3 line change, this is great, and something easy for us to review.


{tinygrad-0.10.0 → tinygrad-0.10.2}/README.md
@@ -81,13 +81,14 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers
  tinygrad already supports numerous accelerators, including:

  - [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
- - [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py)
+ - [x] [CPU (C Code)](tinygrad/runtime/ops_cpu.py)
  - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
  - [x] [METAL](tinygrad/runtime/ops_metal.py)
  - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
  - [x] [AMD](tinygrad/runtime/ops_amd.py)
  - [x] [NV](tinygrad/runtime/ops_nv.py)
  - [x] [QCOM](tinygrad/runtime/ops_qcom.py)
+ - [x] [WEBGPU](tinygrad/runtime/ops_webgpu.py)

  And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -125,8 +126,8 @@ y = Tensor([[2.0,0,-2.0]], requires_grad=True)
  z = y.matmul(x).sum()
  z.backward()

- print(x.grad.numpy()) # dz/dx
- print(y.grad.numpy()) # dz/dy
+ print(x.grad.tolist()) # dz/dx
+ print(y.grad.tolist()) # dz/dy
  ```

  The same thing but in PyTorch:
@@ -138,8 +139,8 @@ y = torch.tensor([[2.0,0,-2.0]], requires_grad=True)
  z = y.matmul(x).sum()
  z.backward()

- print(x.grad.numpy()) # dz/dx
- print(y.grad.numpy()) # dz/dy
+ print(x.grad.tolist()) # dz/dx
+ print(y.grad.tolist()) # dz/dy
  ```

  ## Contributing
@@ -150,7 +151,7 @@ We'll start with what will get your PR closed with a pointer to this section:

  - No code golf! While low line count is a guiding light of this project, anything that remotely looks like code golf will be closed. The true goal is reducing complexity and increasing readability, and deleting `\n`s does nothing to help with that.
  - All docs and whitespace changes will be closed unless you are a well-known contributor. The people writing the docs should be those who know the codebase the absolute best. People who have not demonstrated that shouldn't be messing with docs. Whitespace changes are both useless *and* carry a risk of introducing bugs.
- - Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainablity and readablity.
+ - Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainability and readability.
  - In general, the code outside the core `tinygrad/` folder is not well tested, so unless the current code there is broken, you shouldn't be changing it.
  - If your PR looks "complex", is a big diff, or adds lots of lines, it won't be reviewed or merged. Consider breaking it up into smaller PRs that are individually clear wins. A common pattern I see is prerequisite refactors before adding new functionality. If you can (cleanly) refactor to the point that the feature is a 3 line change, this is great, and something easy for us to review.


{tinygrad-0.10.0 → tinygrad-0.10.2}/setup.py
@@ -7,16 +7,24 @@ directory = Path(__file__).resolve().parent
  with open(directory / 'README.md', encoding='utf-8') as f:
  long_description = f.read()

+ testing_minimal = [
+ "numpy",
+ "torch",
+ "pytest",
+ "pytest-xdist",
+ "hypothesis",
+ ]
+
  setup(name='tinygrad',
- version='0.10.0',
+ version='0.10.2',
  description='You like pytorch? You like micrograd? You love tinygrad! <3',
  author='George Hotz',
  license='MIT',
  long_description=long_description,
  long_description_content_type='text/markdown',
- packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
- 'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
- package_data = {'tinygrad': ['py.typed']},
+ packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine', 'tinygrad.viz',
+ 'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.support.am', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+ package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'perfetto.html', 'assets/**/*']},
  classifiers=[
  "Programming Language :: Python :: 3",
  "License :: OSI Approved :: MIT License"
@@ -24,24 +32,25 @@ setup(name='tinygrad',
  install_requires=[],
  python_requires='>=3.10',
  extras_require={
- 'llvm': ["llvmlite"],
  'arm': ["unicorn"],
  'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
  'linting': [
  "pylint",
- "mypy==1.11.2",
+ "mypy==1.13.0",
  "typing-extensions",
  "pre-commit",
  "ruff",
  "types-tqdm",
  ],
  #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.1.0-rc3"],
- 'testing': [
- "numpy",
- "torch",
+ 'testing_minimal': testing_minimal,
+ 'testing_unit': testing_minimal + [
+ "tqdm",
+ "safetensors",
+ "tabulate" # for sz.py
+ ],
+ 'testing': testing_minimal + [
  "pillow",
- "pytest",
- "pytest-xdist",
  "onnx==1.16.0",
  "onnx2torch",
  "opencv-python",
@@ -54,10 +63,10 @@ setup(name='tinygrad',
  "blobfile",
  "librosa",
  "networkx",
- "hypothesis",
  "nibabel",
  "bottle",
- "ggml-python"
+ "ggml-python",
+ "capstone"
  ],
  'docs': [
  "mkdocs",
@@ -71,6 +80,6 @@ setup(name='tinygrad',
  'testing_tf': [
  "tensorflow==2.15.1",
  "tensorflow_addons",
- ]
+ ],
  },
  include_package_data=True)

{tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_arange.py
@@ -1,11 +1,12 @@
  import unittest, contextlib
  import numpy as np
- from tinygrad import Tensor, GlobalCounters, dtypes, nn
+ from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device
  from tinygrad.helpers import CI, Context, getenv
  from tinygrad.engine.realize import run_schedule
  from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
  from tinygrad.engine.realize import CompiledRunner, ExecItem
  from tinygrad.engine.search import get_kernel_actions
+ from tinygrad.ops import Ops

  class TestArange(unittest.TestCase):
  def _get_flops(self, N, opts=None):
@@ -21,7 +22,7 @@ class TestArange(unittest.TestCase):
  #print(p.src)
  ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
  np.testing.assert_equal(tt.numpy(), np.arange(N))
- return p.op_estimate
+ return p.estimates.ops

  def test_complexity(self, opts=None, limit=None):
  # add 1 to avoid divide by 0. arange is 0 flops now!
@@ -40,7 +41,7 @@ class TestArange(unittest.TestCase):
  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=1)

  @unittest.skip("doesn't work yet")
- def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+ def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, arg=32)])

  def test_all_opts(self, opts=None, exclude=None):
  k = Kernel(Tensor.arange(256).schedule()[-1].ast)
@@ -58,11 +59,11 @@ class TestArange(unittest.TestCase):
  self.test_complexity(opts)
  def test_all_opts_w_local(self):
  with contextlib.suppress(KernelOptError):
- return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+ return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, arg=32)])
  def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
- def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+ def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])
  def test_all_opts_w_upcast_and_unroll(self):
- return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+ return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])

  class TestIndexing(unittest.TestCase):
  def test_arange_2_reduce(self):
@@ -71,12 +72,11 @@ class TestIndexing(unittest.TestCase):
  needle.realize()
  with Context(NOOPT=1, FUSE_ARANGE=1):
  GlobalCounters.reset()
- # TODO: it should work without these reshapes
- out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+ out = ((Tensor.arange(1,16385)-1)*needle).sum()
  sched = out.schedule()
- assert len(sched) == 1
+ self.assertEqual(len(sched), 1)
  run_schedule(sched)
- assert out.item() == 1337, f"expected 1337, got {out.item()}"
+ self.assertEqual(out.item(), 1337)

  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
  def test_manual_index(self):
@@ -86,13 +86,13 @@ class TestIndexing(unittest.TestCase):
  print("*** indexing ***")
  with Context(NOOPT=1, FUSE_ARANGE=1):
  GlobalCounters.reset()
- rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+ rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, 256, 16384, 1)
  idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
  reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
  full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
  X = full.sum(axis=(2,3))
  sched = X.schedule()
- assert len(sched) == 1
+ self.assertEqual(len(sched), 1)
  run_schedule(sched)
  assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
  np.testing.assert_allclose(real_index, X.numpy())
@@ -108,7 +108,7 @@ class TestIndexing(unittest.TestCase):
  assert X.shape == (4,256)
  sched = X.schedule()
  # TODO: enable these asserts when the scheduler can handle this
- #assert len(sched) == 1, f"{len(sched)} != 1"
+ #self.assertEqual(len(sched), 1)
  run_schedule(sched)
  #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
  np.testing.assert_allclose(real_index, X.numpy())
@@ -123,7 +123,7 @@ class TestIndexing(unittest.TestCase):
  X = dataset[idxs]
  assert X.shape == (4,256)
  sched = X.schedule()
- assert len(sched) == 2
+ self.assertEqual(len(sched), 2)
  run_schedule(sched)
  assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
  np.testing.assert_allclose(real_index, X.numpy())
@@ -138,7 +138,7 @@ class TestIndexing(unittest.TestCase):
  np.testing.assert_equal(X.numpy(), 0)

  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
- def test_index_mnist(self, noopt=1, op_limit=512*784*5):
+ def test_index_mnist(self, noopt=1, op_limit=512*784*13):
  from tinygrad.nn.datasets import mnist
  X_train, Y_train, _, _ = mnist()
  with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
@@ -152,12 +152,13 @@ class TestIndexing(unittest.TestCase):
  @unittest.skip("not ready")
  def test_index_mnist_opt(self): self.test_index_mnist(0)

- @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+ @unittest.skipIf(getenv("PTX") or Device.DEFAULT == "WEBGPU", "broken on ptx and WebGPU for some reason")
  def test_llama_embedding(self, noopt=1, op_limit=65536):
  # llama3 is 128256
  vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
  emb = nn.Embedding(vocab_size, embed_size)
- emb_w = emb.weight.numpy()
+ # TODO: why is a new realize needed here
+ emb_w = emb.weight.realize().numpy()
  x = Tensor([1,2,3,4])
  with Context(NOOPT=noopt, FUSE_ARANGE=1):
  GlobalCounters.reset()

{tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_assign.py
@@ -2,7 +2,8 @@
  import unittest
  import numpy as np
  from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
- from tinygrad.engine.schedule import create_schedule
+ from tinygrad.device import is_dtype_supported
+ from tinygrad.helpers import temp

  N = 200 # has to be bigger than the cache to fail

@@ -168,16 +169,6 @@ class TestAssign(unittest.TestCase):
  a += 1
  np.testing.assert_allclose(a.numpy(), 3)

- # NOTE: this is similar to the resnet failure
- #@unittest.expectedFailure
- def test_double_assign_alt(self):
- a = Tensor.ones(4).contiguous().realize()
- b = Tensor([1, 2, 3, 4]).realize().lazydata
- a1 = a.lazydata.assign(b)
- a2 = a.lazydata.assign(b)
- sched = create_schedule([a1, a2])
- self.assertEqual(len(sched), 1)
-
  def test_crossover_assign(self):
  a = Tensor.full((4,), 2).contiguous().realize()
  b = Tensor.full((4,), 3).contiguous().realize()
@@ -212,6 +203,7 @@ class TestAssign(unittest.TestCase):
  np.testing.assert_equal(b0.numpy(), 128)
  np.testing.assert_equal(b1.numpy(), 608)

+ @unittest.skip("TODO: bring this assert back")
  def test_crossunder_assign(self):
  # NOTE: should *not* raise AssertionError from numpy
  with self.assertRaisesRegex(RuntimeError, "cycle"):
@@ -293,6 +285,7 @@ class TestAssign(unittest.TestCase):
  #assert ba1 == ba2 and ba1 != bb1
  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

+ @unittest.skip("multi output not supported anymore")
  def test_simple_assignment_multioutput(self):
  a = Tensor.randn(32, 32).realize()
  b = Tensor.full((32, ), 1.).contiguous().realize()
@@ -331,6 +324,7 @@ class TestAssign(unittest.TestCase):
  b.assign(r + b.permute(1, 0))
  b.realize()

+ @unittest.skip("multi output not supported anymore")
  def test_permuted_reduceop_multioutput_dual_use(self):
  a = Tensor.randn(32, 32, 32).realize()
  b = Tensor.full((32, 32), 1.).contiguous().realize()
@@ -343,6 +337,7 @@ class TestAssign(unittest.TestCase):
  c.assign(r + b_perm)
  Tensor.realize(b, c)

+ @unittest.skip("multi output not supported anymore")
  def test_permuted_reduceop_multioutput_dual_use_possible(self):
  a = Tensor.randn(32, 32, 32, dtype=dtypes.int).realize()
  b = Tensor.arange(32 * 32).reshape(32, 32).realize()
@@ -376,6 +371,14 @@ class TestAssign(unittest.TestCase):

  # TODO: is there a way to sneak in a permute such that it returns the wrong answer?

+ @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
+ def test_setitem_half(self):
+ a = Tensor.full((8,), 1.0, dtype=dtypes.half).contiguous().realize()
+ b = Tensor.full((4,), 2.0, dtype=dtypes.half).contiguous().realize()
+ assign = a[:4].assign(b)
+ assign.realize()
+ np.testing.assert_allclose(a.numpy(), [2., 2., 2., 2., 1., 1., 1., 1.])
+
  @unittest.skip("don't use output buffer, and mismatch dtype no longer supported")
  def test_cast_assignment(self):
  a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
@@ -387,5 +390,9 @@ class TestAssign(unittest.TestCase):
  assert oba1 is None and oba2 is None
  np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))

+ def test_disk_assignment(self):
+ a = Tensor.empty(5, device=f"disk:{temp('disk_assignment')}").assign(Tensor.ones(5)).numpy()
+ np.testing.assert_equal(a, np.ones(5))
+
  if __name__ == "__main__":
  unittest.main()

{tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_const_folding.py
@@ -1,16 +1,18 @@
- import unittest, math
+ import unittest, itertools, math
+ from typing import Any
  from tinygrad import Tensor, Device, dtypes
- from tinygrad.ops import Ops
- from tinygrad.engine.schedule import create_schedule
+ from tinygrad.dtype import DType
+ from tinygrad.ops import Ops, UOp
  from tinygrad.helpers import CI
+ from tinygrad.codegen.devectorizer import full_graph_rewrite
  import numpy as np
  from tinygrad.device import is_dtype_supported

  def _check_ast_count(desired_count:int, t:Tensor):
  # NOTE: this has side effect because everything can be scheduled only once
- schedule = create_schedule(t.lazydata.lbs)
+ schedule = t.schedule()
  asts = [s for s in schedule if s.ast.op is Ops.SINK]
- assert len(asts) == desired_count
+ assert len(asts) == desired_count, f"{len(asts)} != {desired_count}"

  class TestUnaryOpsConstFolding(unittest.TestCase):
  def test_all_consts_ops(self):
@@ -98,13 +100,47 @@ class TestBinaryOpsConstFolding(unittest.TestCase):
  def test_tensor_one_pow(self):
  _check_ast_count(0, Tensor.ones(4) ** Tensor([1.0, 2, 3, 4]))

+ class TestBitcastConstFolding(unittest.TestCase):
+ def test_scalar_bitcast(self):
+ def t(cases: dict[DType, Any]):
+ for (from_dt, from_v), (to_dt, to_v) in itertools.product(cases.items(), cases.items()):
+ if not math.isnan(from_v):
+ r = full_graph_rewrite(UOp.const(from_dt, from_v).bitcast(to_dt).sink()).src[0]
+ self.assertEqual(r.op, Ops.CONST, msg:=f"{from_dt} -> {to_dt} ({from_v} -> {to_v})")
+ self.assertEqual(r.dtype, to_dt, msg)
+ np.testing.assert_equal(r.arg, to_v, msg)
+
+ t({dtypes.int8: 0, dtypes.uint8: 0, dtypes.bool: False})
+ t({dtypes.int8: 1, dtypes.uint8: 1, dtypes.bool: True})
+
+ t({dtypes.int8: -1, dtypes.uint8: 2**8-1})
+ t({dtypes.int16: -1, dtypes.uint16: 2**16-1, dtypes.float16: float('nan')})
+ t({dtypes.int32: -1, dtypes.uint32: 2**32-1, dtypes.float32: float('nan')})
+ t({dtypes.int64: -1, dtypes.uint64: 2**64-1, dtypes.float64: float('nan')})
+
+ t({dtypes.int8: -2**7, dtypes.uint8: 2**7})
+ t({dtypes.int16: -2**15, dtypes.uint16: 2**15})
+ t({dtypes.int32: -2**31, dtypes.uint32: 2**31})
+ t({dtypes.int64: -2**63, dtypes.uint64: 2**63})
+
+ t({dtypes.int16: 13496, dtypes.uint16: 13496, dtypes.float16: 0.294921875})
+ t({dtypes.int32: 1050081145, dtypes.uint32: 1050081145, dtypes.float32: 0.29485681653022766})
+ t({dtypes.int64: 4598983288165178391, dtypes.uint64: 4598983288165178391, dtypes.float64: 0.29485681936461233})
+
+ def test_vec_bitcast(self):
+ r = full_graph_rewrite(UOp.const(dtypes.int32.vec(3), (-1, -2**31, 75)).bitcast(dtypes.uint32.vec(3)).sink()).src[0]
+ self.assertEqual(r.op, Ops.VECTORIZE)
+ self.assertEqual(r.dtype, dtypes.uint32.vec(3))
+ self.assertEqual(tuple(x.arg for x in r.src), (2**32-1, 2**31, 75))
+
  # folds advance indexing into basic indexing
  class TestIndexingConstFolding(unittest.TestCase):
  def test_scalar_index(self):
  t = Tensor.arange(16).float().reshape(1,1,4,4).realize()
- _check_ast_count(0, t[:,:,Tensor(1),:])
- _check_ast_count(0, t[:,:,Tensor(1)+2,:])
- _check_ast_count(0, t[:,:,Tensor(1),Tensor(0)])
+ # TODO: fold these
+ _check_ast_count(2, t[:,:,Tensor(1),:])
+ _check_ast_count(2, t[:,:,Tensor(1)+2,:])
+ _check_ast_count(2, t[:,:,Tensor(1),Tensor(0)])

  @unittest.expectedFailure
  def test_const_tensor_index(self):
@@ -130,11 +166,12 @@ class TestMovedConstFolding(unittest.TestCase):

  def test_cast_padded(self):
  # NOTE: this is folded due to CAST_BEFORE_VIEW
+ # update: CAST_BEFORE_VIEW=1 is no longer supported
  if is_dtype_supported(dtypes.int16):
- _check_ast_count(0, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
+ _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
  np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
  if is_dtype_supported(dtypes.uint16):
- _check_ast_count(0, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
+ _check_ast_count(1, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
  np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
  # not folded
  if is_dtype_supported(dtypes.int64):
@@ -158,6 +195,37 @@ class TestReduceOpsConstFolding(unittest.TestCase):
  _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).exp().sum())
  np.testing.assert_allclose(Tensor.ones(4).pad(((1, 1),)).exp().sum().numpy(), 4 * math.e + 2)

+ def test_bool_zero_max(self):
+ _check_ast_count(0, Tensor.full((1, 2), True).shrink(((0, 1), (0, 0))).max((1, 0)))
+ np.testing.assert_equal(Tensor.full((1, 2), True).shrink(((0, 1), (0, 0))).max((1, 0)).numpy(), False)
+
+ def test_zero_size_ops(self):
+ for reduceop in [lambda x:x.prod(), lambda x:x.sum()]: # lambda x:x.max() NOTE: numpy gives "reduction operation maximum which has no identity"
+ _check_ast_count(0, reduceop(Tensor.empty(1, 0)))
+ np.testing.assert_equal(reduceop(Tensor.empty(shape:=(1, 0))).numpy(), reduceop(np.empty(shape)))
+
+ def test_zero_size_ops_view(self):
+ for reduceop in [lambda x:x.prod(), lambda x:x.sum()]:
+ _check_ast_count(0, reduceop(Tensor.empty(1, 0, 4).permute((1, 2, 0)).contiguous()))
+ np.testing.assert_equal(reduceop(Tensor.empty(shape:=(1, 0))).numpy(), reduceop(np.empty((shape))))
+
+ def test_zero_size_ops_realized(self):
+ for reduceop in [lambda x:x.prod(), lambda x:x.sum()]:
+ _check_ast_count(0, reduceop((Tensor.randn(0, 1)+1).realize()))
+ np.testing.assert_equal(reduceop((Tensor.randn(shape:=(0, 1))+1).realize()).numpy(), reduceop(np.empty(shape)))
+
+ def test_zero_size_realize_folded(self):
+ # non contiguous folded output doesn't realize
+ _check_ast_count(0, Tensor.empty(1, 0).sum())
+ # contiguous folded const can still schedule
+ a = Tensor.empty(1, 0).sum().contiguous()
+ _check_ast_count(2, a+2)
+ self.assertIsNotNone(a.lazydata.base.realized)
+ np.testing.assert_equal((Tensor.empty(1, 0).sum().contiguous()+2).numpy(), 2)
+ # otherwise we just fuse it
+ _check_ast_count(1, (Tensor.empty(1, 0).sum()+2).contiguous())
+ np.testing.assert_equal((Tensor.empty(1, 0).sum()+2).numpy(), 2)
+
  def test_const_prod(self):
  _check_ast_count(0, Tensor.full((2, 3), fill_value=2).prod())
  np.testing.assert_equal(Tensor.full((2, 3), fill_value=2).prod().numpy(), 2**(2*3))
@@ -206,6 +274,8 @@ class TestMultiConstFolding(unittest.TestCase):
  _check_ast_count(0, t ** 1)
  _check_ast_count(0, 1 ** t)

+ # failing because multi calls .contiguous() on every single sharded uop
+ @unittest.expectedFailure
  def test_multi_const_folding_tensor(self):
  ds = tuple(f"{Device.DEFAULT}:{i}" for i in range(4))
  t = Tensor.arange(16).float().realize().to(ds)

{tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_conv_shapetracker.py
@@ -3,7 +3,6 @@ import unittest
  from tinygrad.ops import Ops
  from tinygrad.tensor import Tensor
  from tinygrad.nn import Conv2d
- from tinygrad.engine.schedule import create_schedule
  from tinygrad.shape.shapetracker import ShapeTracker, View
  from tinygrad.helpers import prod
  from test.unit.test_shapetracker import shapetracker_getitem
@@ -11,13 +10,12 @@ from test.unit.test_shapetracker import shapetracker_getitem
  class TestConvShapetracker(unittest.TestCase):
  def test_conv_3x3_one_view(self):
  conv = Conv2d(16, 32, (3, 3))
-
  # first run to init the weights, they are scheduled.
- create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata])
+ conv(Tensor.empty(1, 16, 10, 10)).schedule()
  # run it again to get the kernels
- sched = [si for si in create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata]) if si.ast.op is Ops.SINK]
+ sched = [si for si in conv(Tensor.empty(1, 16, 10, 10)).schedule() if si.ast.op is Ops.SINK]
  assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
- for st in [x.st_arg for x in sched[0].ast.parents if x.op is Ops.LOAD]:
+ for st in [x.st_arg for x in sched[0].ast.toposort if x.op is Ops.LOAD]:
  assert len(st.views) == 1

  def test_conv_2x2_backward_one_view(self):
@@ -26,11 +24,10 @@ class TestConvShapetracker(unittest.TestCase):
  conv(X).mean().backward()
  si = X.grad.schedule()[-1]
  print(si)
- ldb = [x for x in si.ast.parents if x.op is Ops.LOAD][0]
+ ldb = [x for x in si.ast.toposort if x.op is Ops.LOAD][0]
  st: ShapeTracker = ldb.st_arg.simplify()
- # NOTE: st.real_size() is broken
  print(si.inputs[0].size)
- #self.assertEqual(si.inputs[0].size, st.real_size())
+ self.assertEqual(si.inputs[0].size, st.real_size())
  for v in st.views: print(v)

  # same st