tinygrad 0.10.2__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. {tinygrad-0.10.2/tinygrad.egg-info → tinygrad-0.11.0}/PKG-INFO +24 -16
  2. {tinygrad-0.10.2 → tinygrad-0.11.0}/README.md +5 -6
  3. {tinygrad-0.10.2 → tinygrad-0.11.0}/setup.py +37 -14
  4. tinygrad-0.11.0/test/test_amd_llvm.py +52 -0
  5. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_arange.py +78 -39
  6. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_assign.py +11 -11
  7. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_const_folding.py +27 -36
  8. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_copy_speed.py +29 -4
  9. tinygrad-0.11.0/test/test_define_reg.py +32 -0
  10. tinygrad-0.11.0/test/test_disassembly.py +21 -0
  11. tinygrad-0.11.0/test/test_dtype.py +426 -0
  12. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_dtype_alu.py +19 -16
  13. tinygrad-0.11.0/test/test_edgecases.py +276 -0
  14. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_gc.py +33 -3
  15. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_graph.py +38 -9
  16. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_hcq.py +120 -40
  17. tinygrad-0.11.0/test/test_hcq_iface.py +105 -0
  18. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_image_dtype.py +31 -24
  19. tinygrad-0.11.0/test/test_interop.py +52 -0
  20. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_jit.py +270 -9
  21. tinygrad-0.11.0/test/test_jit_cases.py +78 -0
  22. tinygrad-0.11.0/test/test_linalg.py +76 -0
  23. tinygrad-0.11.0/test/test_linearizer.py +1423 -0
  24. tinygrad-0.11.0/test/test_linearizer_dumb.py +201 -0
  25. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_linearizer_overflows.py +37 -68
  26. tinygrad-0.11.0/test/test_memory_planner.py +124 -0
  27. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_multitensor.py +274 -124
  28. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_nn.py +75 -171
  29. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_ops.py +568 -249
  30. tinygrad-0.11.0/test/test_opt_gemm.py +43 -0
  31. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_optim.py +27 -1
  32. tinygrad-0.11.0/test/test_outerworld_range.py +148 -0
  33. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_pickle.py +12 -8
  34. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_profiler.py +65 -15
  35. tinygrad-0.11.0/test/test_quantize_onnx.py +364 -0
  36. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_randomness.py +50 -16
  37. tinygrad-0.11.0/test/test_remote.py +99 -0
  38. tinygrad-0.11.0/test/test_renderer_failures.py +121 -0
  39. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_sample.py +3 -1
  40. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_schedule.py +520 -520
  41. tinygrad-0.11.0/test/test_search.py +146 -0
  42. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_setitem.py +39 -13
  43. tinygrad-0.11.0/test/test_softmax_fusion.py +202 -0
  44. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_speed_v_torch.py +7 -4
  45. tinygrad-0.11.0/test/test_stunning.py +59 -0
  46. tinygrad-0.11.0/test/test_subbuffer.py +183 -0
  47. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_symbolic_jit.py +35 -1
  48. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_symbolic_ops.py +97 -6
  49. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor.py +105 -40
  50. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_uop.py +5 -5
  51. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_variable.py +30 -24
  52. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tiny.py +28 -9
  53. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_transcendental.py +36 -5
  54. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uop_graph.py +163 -140
  55. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uops.py +114 -119
  56. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uops_stats.py +30 -34
  57. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_winograd.py +18 -29
  58. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_zero_copy.py +1 -1
  59. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/__init__.py +1 -1
  60. tinygrad-0.11.0/tinygrad/apps/llm.py +206 -0
  61. tinygrad-0.11.0/tinygrad/codegen/__init__.py +116 -0
  62. tinygrad-0.11.0/tinygrad/codegen/devectorizer.py +390 -0
  63. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/codegen/expander.py +8 -16
  64. tinygrad-0.11.0/tinygrad/codegen/gpudims.py +89 -0
  65. tinygrad-0.11.0/tinygrad/codegen/linearize.py +236 -0
  66. tinygrad-0.11.0/tinygrad/codegen/lowerer.py +114 -0
  67. tinygrad-0.11.0/tinygrad/codegen/opt/__init__.py +38 -0
  68. tinygrad-0.11.0/tinygrad/codegen/opt/heuristic.py +125 -0
  69. tinygrad-0.11.0/tinygrad/codegen/opt/kernel.py +510 -0
  70. {tinygrad-0.10.2/tinygrad/engine → tinygrad-0.11.0/tinygrad/codegen/opt}/search.py +51 -35
  71. tinygrad-0.11.0/tinygrad/codegen/opt/swizzler.py +134 -0
  72. tinygrad-0.11.0/tinygrad/codegen/opt/tc.py +127 -0
  73. tinygrad-0.11.0/tinygrad/codegen/quantize.py +67 -0
  74. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/device.py +122 -132
  75. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/dtype.py +152 -35
  76. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/engine/jit.py +81 -54
  77. tinygrad-0.11.0/tinygrad/engine/memory.py +69 -0
  78. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/engine/realize.py +82 -41
  79. tinygrad-0.11.0/tinygrad/engine/schedule.py +83 -0
  80. tinygrad-0.11.0/tinygrad/frontend/onnx.py +1253 -0
  81. tinygrad-0.11.0/tinygrad/frontend/torch.py +5 -0
  82. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/gradient.py +19 -27
  83. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/helpers.py +95 -47
  84. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/__init__.py +7 -8
  85. tinygrad-0.11.0/tinygrad/nn/optim.py +177 -0
  86. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/state.py +37 -23
  87. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/__init__.py +40 -60
  88. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/cstyle.py +143 -128
  89. tinygrad-0.11.0/tinygrad/renderer/llvmir.py +242 -0
  90. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/ptx.py +50 -32
  91. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/wgsl.py +27 -23
  92. tinygrad-0.11.0/tinygrad/runtime/autogen/am/am.py +5861 -0
  93. tinygrad-0.11.0/tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
  94. tinygrad-0.11.0/tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
  95. tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
  96. tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
  97. tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
  98. tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
  99. tinygrad-0.11.0/tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
  100. tinygrad-0.11.0/tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
  101. tinygrad-0.11.0/tinygrad/runtime/autogen/amd_gpu.py +22115 -0
  102. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/comgr.py +35 -9
  103. tinygrad-0.11.0/tinygrad/runtime/autogen/comgr_3.py +906 -0
  104. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/cuda.py +2419 -494
  105. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/hsa.py +57 -16
  106. tinygrad-0.11.0/tinygrad/runtime/autogen/ib.py +7171 -0
  107. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/io_uring.py +917 -118
  108. tinygrad-0.11.0/tinygrad/runtime/autogen/kfd.py +1548 -0
  109. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/libc.py +613 -218
  110. tinygrad-0.11.0/tinygrad/runtime/autogen/libusb.py +1643 -0
  111. tinygrad-0.11.0/tinygrad/runtime/autogen/nv/nv.py +8602 -0
  112. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
  113. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/opencl.py +2 -4
  114. tinygrad-0.11.0/tinygrad/runtime/autogen/sqtt.py +1789 -0
  115. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/vfio.py +3 -3
  116. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/webgpu.py +273 -264
  117. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/cuda.py +3 -3
  118. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/hcq.py +68 -29
  119. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/metal.py +29 -13
  120. tinygrad-0.11.0/tinygrad/runtime/graph/remote.py +114 -0
  121. tinygrad-0.11.0/tinygrad/runtime/ops_amd.py +852 -0
  122. tinygrad-0.11.0/tinygrad/runtime/ops_cpu.py +125 -0
  123. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_cuda.py +12 -14
  124. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_disk.py +13 -10
  125. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_dsp.py +47 -40
  126. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_gpu.py +13 -11
  127. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_hip.py +6 -9
  128. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_llvm.py +35 -15
  129. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_metal.py +29 -19
  130. tinygrad-0.11.0/tinygrad/runtime/ops_npy.py +11 -0
  131. tinygrad-0.11.0/tinygrad/runtime/ops_null.py +28 -0
  132. tinygrad-0.11.0/tinygrad/runtime/ops_nv.py +621 -0
  133. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_python.py +62 -52
  134. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_qcom.py +28 -39
  135. tinygrad-0.11.0/tinygrad/runtime/ops_remote.py +482 -0
  136. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_webgpu.py +28 -28
  137. tinygrad-0.11.0/tinygrad/runtime/support/am/amdev.py +261 -0
  138. tinygrad-0.11.0/tinygrad/runtime/support/am/ip.py +502 -0
  139. tinygrad-0.11.0/tinygrad/runtime/support/amd.py +138 -0
  140. tinygrad-0.10.2/tinygrad/runtime/support/compiler_hip.py → tinygrad-0.11.0/tinygrad/runtime/support/compiler_amd.py +40 -8
  141. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/compiler_cuda.py +8 -11
  142. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/elf.py +2 -1
  143. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/hcq.py +184 -97
  144. tinygrad-0.11.0/tinygrad/runtime/support/ib.py +172 -0
  145. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/llvm.py +3 -4
  146. tinygrad-0.11.0/tinygrad/runtime/support/memory.py +251 -0
  147. tinygrad-0.11.0/tinygrad/runtime/support/nv/ip.py +581 -0
  148. tinygrad-0.11.0/tinygrad/runtime/support/nv/nvdev.py +183 -0
  149. tinygrad-0.11.0/tinygrad/runtime/support/system.py +170 -0
  150. tinygrad-0.11.0/tinygrad/runtime/support/usb.py +268 -0
  151. tinygrad-0.11.0/tinygrad/runtime/support/webgpu.py +18 -0
  152. tinygrad-0.11.0/tinygrad/schedule/__init__.py +0 -0
  153. tinygrad-0.11.0/tinygrad/schedule/grouper.py +119 -0
  154. tinygrad-0.11.0/tinygrad/schedule/kernelize.py +368 -0
  155. tinygrad-0.11.0/tinygrad/schedule/multi.py +231 -0
  156. tinygrad-0.11.0/tinygrad/shape/__init__.py +0 -0
  157. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/shape/shapetracker.py +40 -46
  158. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/shape/view.py +88 -52
  159. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/tensor.py +968 -542
  160. tinygrad-0.11.0/tinygrad/uop/__init__.py +117 -0
  161. tinygrad-0.10.2/tinygrad/codegen/transcendental.py → tinygrad-0.11.0/tinygrad/uop/decompositions.py +125 -38
  162. tinygrad-0.11.0/tinygrad/uop/mathtraits.py +169 -0
  163. tinygrad-0.11.0/tinygrad/uop/ops.py +1021 -0
  164. tinygrad-0.11.0/tinygrad/uop/spec.py +228 -0
  165. {tinygrad-0.10.2/tinygrad/codegen → tinygrad-0.11.0/tinygrad/uop}/symbolic.py +239 -216
  166. tinygrad-0.11.0/tinygrad/uop/upat.py +163 -0
  167. tinygrad-0.11.0/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
  168. tinygrad-0.11.0/tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
  169. tinygrad-0.11.0/tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
  170. tinygrad-0.11.0/tinygrad/viz/index.html +344 -0
  171. tinygrad-0.11.0/tinygrad/viz/js/index.js +718 -0
  172. tinygrad-0.11.0/tinygrad/viz/js/worker.js +29 -0
  173. tinygrad-0.11.0/tinygrad/viz/serve.py +327 -0
  174. {tinygrad-0.10.2 → tinygrad-0.11.0/tinygrad.egg-info}/PKG-INFO +24 -16
  175. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/SOURCES.txt +70 -21
  176. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/requires.txt +16 -9
  177. tinygrad-0.10.2/test/test_conv.py +0 -150
  178. tinygrad-0.10.2/test/test_conv_shapetracker.py +0 -55
  179. tinygrad-0.10.2/test/test_dtype.py +0 -893
  180. tinygrad-0.10.2/test/test_fuzz_shape_ops.py +0 -88
  181. tinygrad-0.10.2/test/test_linearizer.py +0 -2203
  182. tinygrad-0.10.2/test/test_linearizer_dumb.py +0 -225
  183. tinygrad-0.10.2/test/test_linearizer_failures.py +0 -1415
  184. tinygrad-0.10.2/test/test_masked_st.py +0 -32
  185. tinygrad-0.10.2/test/test_quantize_onnx.py +0 -212
  186. tinygrad-0.10.2/test/test_rearrange_einops.py +0 -321
  187. tinygrad-0.10.2/test/test_renderer_failures.py +0 -76
  188. tinygrad-0.10.2/test/test_search.py +0 -190
  189. tinygrad-0.10.2/test/test_subbuffer.py +0 -68
  190. tinygrad-0.10.2/test/test_symbolic_shapetracker.py +0 -244
  191. tinygrad-0.10.2/tinygrad/codegen/devectorizer.py +0 -247
  192. tinygrad-0.10.2/tinygrad/codegen/kernel.py +0 -693
  193. tinygrad-0.10.2/tinygrad/codegen/linearize.py +0 -234
  194. tinygrad-0.10.2/tinygrad/codegen/lowerer.py +0 -161
  195. tinygrad-0.10.2/tinygrad/engine/memory.py +0 -50
  196. tinygrad-0.10.2/tinygrad/engine/multi.py +0 -161
  197. tinygrad-0.10.2/tinygrad/engine/schedule.py +0 -458
  198. tinygrad-0.10.2/tinygrad/nn/optim.py +0 -146
  199. tinygrad-0.10.2/tinygrad/ops.py +0 -1003
  200. tinygrad-0.10.2/tinygrad/renderer/llvmir.py +0 -191
  201. tinygrad-0.10.2/tinygrad/runtime/autogen/amd_gpu.py +0 -87879
  202. tinygrad-0.10.2/tinygrad/runtime/autogen/kfd.py +0 -826
  203. tinygrad-0.10.2/tinygrad/runtime/ops_amd.py +0 -635
  204. tinygrad-0.10.2/tinygrad/runtime/ops_cloud.py +0 -220
  205. tinygrad-0.10.2/tinygrad/runtime/ops_cpu.py +0 -24
  206. tinygrad-0.10.2/tinygrad/runtime/ops_npy.py +0 -9
  207. tinygrad-0.10.2/tinygrad/runtime/ops_nv.py +0 -549
  208. tinygrad-0.10.2/tinygrad/runtime/support/allocator.py +0 -94
  209. tinygrad-0.10.2/tinygrad/runtime/support/am/amdev.py +0 -396
  210. tinygrad-0.10.2/tinygrad/runtime/support/am/ip.py +0 -463
  211. tinygrad-0.10.2/tinygrad/spec.py +0 -155
  212. tinygrad-0.10.2/tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
  213. tinygrad-0.10.2/tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
  214. tinygrad-0.10.2/tinygrad/viz/index.html +0 -544
  215. tinygrad-0.10.2/tinygrad/viz/perfetto.html +0 -178
  216. tinygrad-0.10.2/tinygrad/viz/serve.py +0 -205
  217. {tinygrad-0.10.2 → tinygrad-0.11.0}/LICENSE +0 -0
  218. {tinygrad-0.10.2 → tinygrad-0.11.0}/setup.cfg +0 -0
  219. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_compile_failures.py +0 -0
  220. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_device_speed.py +0 -0
  221. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_fusion_op.py +0 -0
  222. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_kernel_cache.py +0 -0
  223. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_metal.py +0 -0
  224. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_method_cache.py +0 -0
  225. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_net_speed.py +0 -0
  226. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_ocl.py +0 -0
  227. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_specific_conv.py +0 -0
  228. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_data.py +0 -0
  229. {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_to_numpy.py +0 -0
  230. {tinygrad-0.10.2/tinygrad/codegen → tinygrad-0.11.0/tinygrad/engine}/__init__.py +0 -0
  231. {tinygrad-0.10.2/tinygrad/engine → tinygrad-0.11.0/tinygrad/frontend}/__init__.py +0 -0
  232. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/datasets.py +0 -0
  233. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/py.typed +0 -0
  234. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/__init__.py +0 -0
  235. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/adreno.py +0 -0
  236. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/hip.py +0 -0
  237. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/kgsl.py +0 -0
  238. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/llvm.py +0 -0
  239. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/nvrtc.py +0 -0
  240. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/pci.py +0 -0
  241. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/qcom_dsp.py +0 -0
  242. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/__init__.py +0 -0
  243. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/__init__.py +0 -0
  244. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/am/__init__.py +0 -0
  245. {tinygrad-0.10.2/tinygrad/shape → tinygrad-0.11.0/tinygrad/runtime/support/nv}/__init__.py +0 -0
  246. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +0 -0
  247. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +0 -0
  248. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +0 -0
  249. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +0 -0
  250. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +0 -0
  251. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/dependency_links.txt +0 -0
  252. {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: tinygrad
- Version: 0.10.2
+ Version: 0.11.0
  Summary: You like pytorch? You like micrograd? You love tinygrad! <3
  Author: George Hotz
  License: MIT
@@ -19,31 +19,38 @@ Requires-Dist: mypy==1.13.0; extra == "linting"
  Requires-Dist: typing-extensions; extra == "linting"
  Requires-Dist: pre-commit; extra == "linting"
  Requires-Dist: ruff; extra == "linting"
- Requires-Dist: types-tqdm; extra == "linting"
+ Requires-Dist: numpy; extra == "linting"
  Provides-Extra: testing-minimal
  Requires-Dist: numpy; extra == "testing-minimal"
- Requires-Dist: torch; extra == "testing-minimal"
+ Requires-Dist: torch==2.7.1; extra == "testing-minimal"
  Requires-Dist: pytest; extra == "testing-minimal"
  Requires-Dist: pytest-xdist; extra == "testing-minimal"
  Requires-Dist: hypothesis; extra == "testing-minimal"
+ Requires-Dist: z3-solver; extra == "testing-minimal"
+ Requires-Dist: ml_dtypes; extra == "testing-minimal"
  Provides-Extra: testing-unit
  Requires-Dist: numpy; extra == "testing-unit"
- Requires-Dist: torch; extra == "testing-unit"
+ Requires-Dist: torch==2.7.1; extra == "testing-unit"
  Requires-Dist: pytest; extra == "testing-unit"
  Requires-Dist: pytest-xdist; extra == "testing-unit"
  Requires-Dist: hypothesis; extra == "testing-unit"
+ Requires-Dist: z3-solver; extra == "testing-unit"
+ Requires-Dist: ml_dtypes; extra == "testing-unit"
  Requires-Dist: tqdm; extra == "testing-unit"
  Requires-Dist: safetensors; extra == "testing-unit"
  Requires-Dist: tabulate; extra == "testing-unit"
  Provides-Extra: testing
  Requires-Dist: numpy; extra == "testing"
- Requires-Dist: torch; extra == "testing"
+ Requires-Dist: torch==2.7.1; extra == "testing"
  Requires-Dist: pytest; extra == "testing"
  Requires-Dist: pytest-xdist; extra == "testing"
  Requires-Dist: hypothesis; extra == "testing"
+ Requires-Dist: z3-solver; extra == "testing"
+ Requires-Dist: ml_dtypes; extra == "testing"
  Requires-Dist: pillow; extra == "testing"
- Requires-Dist: onnx==1.16.0; extra == "testing"
+ Requires-Dist: onnx==1.18.0; extra == "testing"
  Requires-Dist: onnx2torch; extra == "testing"
+ Requires-Dist: onnxruntime; extra == "testing"
  Requires-Dist: opencv-python; extra == "testing"
  Requires-Dist: tabulate; extra == "testing"
  Requires-Dist: tqdm; extra == "testing"
@@ -58,6 +65,10 @@ Requires-Dist: nibabel; extra == "testing"
  Requires-Dist: bottle; extra == "testing"
  Requires-Dist: ggml-python; extra == "testing"
  Requires-Dist: capstone; extra == "testing"
+ Requires-Dist: pycocotools; extra == "testing"
+ Requires-Dist: boto3; extra == "testing"
+ Requires-Dist: pandas; extra == "testing"
+ Requires-Dist: influxdb3-python; extra == "testing"
  Provides-Extra: docs
  Requires-Dist: mkdocs; extra == "docs"
  Requires-Dist: mkdocs-material; extra == "docs"
@@ -66,14 +77,12 @@ Requires-Dist: markdown-callouts; extra == "docs"
  Requires-Dist: markdown-exec[ansi]; extra == "docs"
  Requires-Dist: black; extra == "docs"
  Requires-Dist: numpy; extra == "docs"
- Provides-Extra: testing-tf
- Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
- Requires-Dist: tensorflow_addons; extra == "testing-tf"
  Dynamic: author
  Dynamic: classifier
  Dynamic: description
  Dynamic: description-content-type
  Dynamic: license
+ Dynamic: license-file
  Dynamic: provides-extra
  Dynamic: requires-python
  Dynamic: summary
@@ -101,11 +110,11 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

  ---

- This may not be the best deep learning framework, but it is a deep learning framework.
+ Despite tinygrad's size, it is a fully featured deep learning framework.

- Due to its extreme simplicity, it aims to be the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.
+ Due to its extreme simplicity, it is the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.

- tinygrad is still alpha software, but we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.
+ tinygrad is now beta software, we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.

  ## Features

@@ -119,9 +128,8 @@ Try a matmul. See how, despite the style, it is fused into one kernel with the p

  ```sh
  DEBUG=3 python3 -c "from tinygrad import Tensor;
- N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N);
- c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2);
- print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
+ N = 1024; a, b = Tensor.empty(N, N), Tensor.empty(N, N);
+ (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()"
  ```

  And we can change `DEBUG` to `4` to see the generated code.
@@ -21,11 +21,11 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

  ---

- This may not be the best deep learning framework, but it is a deep learning framework.
+ Despite tinygrad's size, it is a fully featured deep learning framework.

- Due to its extreme simplicity, it aims to be the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.
+ Due to its extreme simplicity, it is the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.

- tinygrad is still alpha software, but we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.
+ tinygrad is now beta software, we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.

  ## Features

@@ -39,9 +39,8 @@ Try a matmul. See how, despite the style, it is fused into one kernel with the p

  ```sh
  DEBUG=3 python3 -c "from tinygrad import Tensor;
- N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N);
- c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2);
- print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
+ N = 1024; a, b = Tensor.empty(N, N), Tensor.empty(N, N);
+ (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()"
  ```

  And we can change `DEBUG` to `4` to see the generated code.
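For reference, here is the updated README snippet written out as a plain Python script rather than a shell one-liner. This is a sketch, not part of the diff; the only assumption beyond the lines above is that the `DEBUG` flag can be supplied through the environment before tinygrad is imported, matching how the README one-liner sets it.

```python
# Standalone version of the new README matmul example; set DEBUG to "4" to also see generated code.
import os
os.environ["DEBUG"] = "3"  # assumed: DEBUG is read from the environment when tinygrad is imported

from tinygrad import Tensor

N = 1024
a, b = Tensor.empty(N, N), Tensor.empty(N, N)
# the broadcasted multiply and the sum over axis 2 fuse into a single kernel; realize() triggers it
(a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()
```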
@@ -9,22 +9,44 @@ with open(directory / 'README.md', encoding='utf-8') as f:

  testing_minimal = [
  "numpy",
- "torch",
+ "torch==2.7.1",
  "pytest",
  "pytest-xdist",
  "hypothesis",
+ "z3-solver",
+ "ml_dtypes"
  ]

  setup(name='tinygrad',
- version='0.10.2',
+ version='0.11.0',
  description='You like pytorch? You like micrograd? You love tinygrad! <3',
  author='George Hotz',
  license='MIT',
  long_description=long_description,
  long_description_content_type='text/markdown',
- packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine', 'tinygrad.viz',
- 'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.support.am', 'tinygrad.runtime.graph', 'tinygrad.shape'],
- package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'perfetto.html', 'assets/**/*']},
+ packages = [
+ 'tinygrad',
+ 'tinygrad.apps',
+ 'tinygrad.codegen',
+ 'tinygrad.codegen.opt',
+ 'tinygrad.engine',
+ 'tinygrad.frontend',
+ 'tinygrad.nn',
+ 'tinygrad.renderer',
+ 'tinygrad.runtime',
+ 'tinygrad.runtime.autogen',
+ 'tinygrad.runtime.autogen.am',
+ 'tinygrad.runtime.autogen.nv',
+ 'tinygrad.runtime.graph',
+ 'tinygrad.runtime.support',
+ 'tinygrad.runtime.support.am',
+ 'tinygrad.runtime.support.nv',
+ 'tinygrad.schedule',
+ 'tinygrad.shape',
+ 'tinygrad.uop',
+ 'tinygrad.viz',
+ ],
+ package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'assets/**/*', 'js/*']},
  classifiers=[
  "Programming Language :: Python :: 3",
  "License :: OSI Approved :: MIT License"
@@ -40,19 +62,20 @@ setup(name='tinygrad',
  "typing-extensions",
  "pre-commit",
  "ruff",
- "types-tqdm",
+ "numpy",
  ],
- #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.1.0-rc3"],
+ #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@5.0.0-rc3"],
  'testing_minimal': testing_minimal,
  'testing_unit': testing_minimal + [
  "tqdm",
  "safetensors",
- "tabulate" # for sz.py
+ "tabulate", # for sz.py
  ],
  'testing': testing_minimal + [
  "pillow",
- "onnx==1.16.0",
+ "onnx==1.18.0",
  "onnx2torch",
+ "onnxruntime",
  "opencv-python",
  "tabulate",
  "tqdm",
@@ -66,7 +89,11 @@ setup(name='tinygrad',
  "nibabel",
  "bottle",
  "ggml-python",
- "capstone"
+ "capstone",
+ "pycocotools",
+ "boto3",
+ "pandas",
+ "influxdb3-python"
  ],
  'docs': [
  "mkdocs",
@@ -77,9 +104,5 @@ setup(name='tinygrad',
  "black",
  "numpy",
  ],
- 'testing_tf': [
- "tensorflow==2.15.1",
- "tensorflow_addons",
- ],
  },
  include_package_data=True)
@@ -0,0 +1,52 @@
+ import unittest
+ import numpy as np
+ from tinygrad import Device
+ from tinygrad.device import CompileError
+ from tinygrad.helpers import flat_mv
+ if Device.DEFAULT=="AMD":
+ from tinygrad.runtime.ops_amd import AMDAllocator, AMDDevice, AMDProgram
+ from tinygrad.runtime.support.compiler_amd import AMDLLVMCompiler
+
+ @unittest.skipUnless(Device.DEFAULT == "AMD", "Runs only on AMD")
+ class TestAMDLLVM(unittest.TestCase):
+ def test_compiler(self):
+ src = '''
+ ; https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/imm.ll
+ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
+ entry:
+ store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
+ ret void
+ }
+ '''
+ device = AMDDevice()
+ compiler = AMDLLVMCompiler("gfx1100")
+ obj = compiler.compile(src)
+ allocator = AMDAllocator(device)
+ a = allocator.alloc(1*8)
+ prog = AMDProgram(device, "test", obj)
+ prog(a, wait=True)
+ na = np.empty(1, np.uint64)
+ allocator._copyout(flat_mv(na.data), a)
+ assert na == [0x1234567800000005]
+
+ def test_compiler_diag_error(self):
+ src = """
+ @local_temp0 = internal unnamed_addr addrspace(3) global [{N} x float*] undef, align 16
+ define amdgpu_kernel void @test(float* noalias align 32 %data0, half* noalias align 32 %data1, float* noalias align 32 %data2) #0
+ {{
+ %local_temp0 = addrspacecast [{N} x float*] addrspace(3)* @local_temp0 to [{N} x float*]*
+ %v178 = getelementptr inbounds float, float* %local_temp0, i32 1
+ %v133 = getelementptr inbounds float, float* %data2, i32 1
+ %v134 = load float, float* %v133
+ store float %v134, float* %v178
+ ret void
+ }}
+ """
+ compiler = AMDLLVMCompiler("gfx1100")
+ compiler.compile(src.format(N=65536//8))
+ with self.assertRaises(CompileError):
+ # llvm diagnostic: <unknown>:0:0: local memory (65544) exceeds limit (65536) in function 'test'
+ compiler.compile(src.format(N=65536//8+1))
+
+ if __name__ == '__main__':
+ unittest.main()
@@ -1,12 +1,13 @@
  import unittest, contextlib
  import numpy as np
- from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device
+ from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
  from tinygrad.helpers import CI, Context, getenv
  from tinygrad.engine.realize import run_schedule
- from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
- from tinygrad.engine.realize import CompiledRunner, ExecItem
- from tinygrad.engine.search import get_kernel_actions
- from tinygrad.ops import Ops
+ from tinygrad.codegen.opt.kernel import Opt, OptOps, Kernel, KernelOptError
+ from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
+ from tinygrad.codegen.opt.search import get_kernel_actions
+ from tinygrad.uop.ops import Ops
+ from tinygrad.codegen import apply_rewrites, rewrites_for_views

  class TestArange(unittest.TestCase):
  def _get_flops(self, N, opts=None):
@@ -14,41 +15,46 @@ class TestArange(unittest.TestCase):
  tt = Tensor.arange(N)
  sched = tt.schedule()
  self.assertEqual(len(sched), 1)
- k = Kernel(sched[-1].ast)
- if opts is not None:
- for o in opts: k.apply_opt(o)
- p = k.to_program()
+ p = get_program(sched[-1].ast, opts=opts)
  print(p.name)
  #print(p.src)
- ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+ ExecItem(CompiledRunner(p), [tt.uop.buffer]).run()
  np.testing.assert_equal(tt.numpy(), np.arange(N))
  return p.estimates.ops

  def test_complexity(self, opts=None, limit=None):
- # add 1 to avoid divide by 0. arange is 0 flops now!
- f1 = self._get_flops(256, opts) + 1
- f2 = self._get_flops(2560, opts) + 1
+ f1 = self._get_flops(256, opts)
+ f2 = self._get_flops(2560, opts)
  print(f"{f1=}, {f2=}")
- assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+ # add 1 to avoid divide by 0. arange is 0 flops now!
+ assert (f1 < 6000 and f2 < 6000) or ((f2+1) / (f1+1) < 16), f"bad complexity, flops {(f2+1) / (f1+1):.1f}X while inputs 10X"
  if limit is not None and not getenv("PTX"):
  # PTX counts index ALU in flops
  assert f1 <= limit, f"{f1=}, {limit=}"

- def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=1)
- def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=1)
- def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=1)
- def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=1)
- def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=1)
+ def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=0)
+ def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=0)
+ def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=0)
+ def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=0)
+ def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=0)

- @unittest.skip("doesn't work yet")
- def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, arg=32)])
+ if Device.default.renderer.has_local:
+ # TODO: fix limit
+ def test_complexity_w_group(self): return self.test_complexity([Opt(OptOps.GROUP, 0, 16)], limit=81920)
+ def test_complexity_w_group_top(self): return self.test_complexity([Opt(OptOps.GROUPTOP, 0, 16)], limit=106496)
+
+ def test_complexity_w_local(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16)], limit=0)
+ @unittest.skip("doesn't work yet. TODO: this absolutely should work")
+ def test_complexity_w_local_unroll4(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)], limit=0)
+ @unittest.skip("doesn't work yet")
+ def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.PADTO, axis=1, arg=32)])

  def test_all_opts(self, opts=None, exclude=None):
- k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+ k = Kernel(apply_rewrites(Tensor.arange(256).schedule()[-1].ast, rewrites_for_views))
  if opts is not None:
  for o in opts: k.apply_opt(o)
  all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
- k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+ k = Kernel(apply_rewrites(Tensor.arange(2560).schedule()[-1].ast, rewrites_for_views))
  if opts is not None:
  for o in opts: k.apply_opt(o)
  all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
@@ -65,6 +71,24 @@ class TestArange(unittest.TestCase):
  def test_all_opts_w_upcast_and_unroll(self):
  return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])

+ class TestRand(unittest.TestCase):
+ def test_fused_rand_less_ops(self, noopt=1):
+ GlobalCounters.reset()
+ with Context(FUSE_ARANGE=0, NOOPT=noopt):
+ out = Tensor.rand(16384)
+ out.realize()
+ unfused_ops = GlobalCounters.global_ops
+
+ GlobalCounters.reset()
+ with Context(FUSE_ARANGE=1, NOOPT=noopt):
+ out = Tensor.rand(16384)
+ out.realize()
+ print(f"fused {GlobalCounters.global_ops} unfused {unfused_ops}")
+ self.assertLessEqual(GlobalCounters.global_ops, unfused_ops*2)
+ def test_fused_rand_less_ops_opt(self): self.test_fused_rand_less_ops(0)
+
+ DSET, DDIM = 2048, 32
+
  class TestIndexing(unittest.TestCase):
  def test_arange_2_reduce(self):
  needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
@@ -80,52 +104,63 @@ class TestIndexing(unittest.TestCase):

  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
  def test_manual_index(self):
- dataset = Tensor.rand(16384, 256).realize()
+ dataset = Tensor.rand(DSET, DDIM).realize()
  idxs = Tensor([0,3,5,6]).realize()
  real_index = dataset.numpy()[idxs.numpy()]
  print("*** indexing ***")
  with Context(NOOPT=1, FUSE_ARANGE=1):
  GlobalCounters.reset()
- rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, 256, 16384, 1)
- idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
- reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
- full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+ rng = Tensor.ones(4, DDIM, DSET, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, DDIM, DSET, 1)
+ idxs = idxs.reshape(4,1,1,1).expand(4, DDIM, DSET, 1)
+ reshape_dataset = dataset.T.reshape(1, DDIM, DSET, 1).expand(4, DDIM, DSET, 1)
+ full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, DDIM, DSET, 1))
  X = full.sum(axis=(2,3))
  sched = X.schedule()
  self.assertEqual(len(sched), 1)
  run_schedule(sched)
- assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+ assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops}"
  np.testing.assert_allclose(real_index, X.numpy())

+ def test_index_variable(self):
+ dataset = Tensor.rand(DSET, DDIM).realize()
+ v = Variable("v", 0, DDIM-1)
+ with Context(NOOPT=1, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+ GlobalCounters.reset()
+ vb = Tensor(v.bind(12))
+ comp = dataset[vb].numpy()
+ # no global ops because they are all indexing
+ self.assertEqual(GlobalCounters.global_ops, 0)
+ np.testing.assert_allclose(comp, dataset.numpy()[12])
+
  def test_index(self):
- dataset = Tensor.rand(16384, 256).realize()
+ dataset = Tensor.rand(DSET, DDIM).realize()
  idxs = Tensor([0,3,5,6]).realize()
  real_index = dataset.numpy()[idxs.numpy()]
  print("*** indexing ***")
  with Context(NOOPT=1):
  GlobalCounters.reset()
  X = dataset[idxs]
- assert X.shape == (4,256)
+ assert X.shape == (4,DDIM)
  sched = X.schedule()
  # TODO: enable these asserts when the scheduler can handle this
  #self.assertEqual(len(sched), 1)
  run_schedule(sched)
- #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+ #assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops}"
  np.testing.assert_allclose(real_index, X.numpy())

  def test_index_fused(self, noopt=1):
- dataset = Tensor.rand(16384, 256).realize()
+ dataset = Tensor.rand(DSET, DDIM).realize()
  idxs = Tensor([0,3,5,6]).realize()
  real_index = dataset.numpy()[idxs.numpy()]
  print("*** indexing ***")
  with Context(NOOPT=noopt, FUSE_ARANGE=1):
  GlobalCounters.reset()
  X = dataset[idxs]
- assert X.shape == (4,256)
+ assert X.shape == (4,DDIM)
  sched = X.schedule()
  self.assertEqual(len(sched), 2)
  run_schedule(sched)
- assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+ assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops} != {4*DSET}"
  np.testing.assert_allclose(real_index, X.numpy())
  @unittest.skip("not ready")
  def test_index_fused_opt(self): self.test_index_fused(0)
@@ -138,10 +173,12 @@ class TestIndexing(unittest.TestCase):
  np.testing.assert_equal(X.numpy(), 0)

  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
- def test_index_mnist(self, noopt=1, op_limit=512*784*13):
+ def test_index_mnist(self, noopt=1, op_limit=512*784*13, split_reduceop=0):
+ # WEBGPU generates more ops due to bitpacking of < 4-byte dtypes
+ if Device.DEFAULT == "WEBGPU": op_limit *= 15
  from tinygrad.nn.datasets import mnist
  X_train, Y_train, _, _ = mnist()
- with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+ with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=split_reduceop):
  samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0]).realize()
  GlobalCounters.reset()
  x = X_train[samples].numpy()
@@ -149,10 +186,12 @@ class TestIndexing(unittest.TestCase):
  assert GlobalCounters.global_ops < op_limit, f"too many ops {GlobalCounters.global_ops} != {op_limit}"
  np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
  np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
- @unittest.skip("not ready")
+
  def test_index_mnist_opt(self): self.test_index_mnist(0)
+ def test_index_mnist_split(self): self.test_index_mnist(1, split_reduceop=1)
+ def test_index_mnist_opt_split(self): self.test_index_mnist(0, split_reduceop=1)

- @unittest.skipIf(getenv("PTX") or Device.DEFAULT == "WEBGPU", "broken on ptx and WebGPU for some reason")
+ @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
  def test_llama_embedding(self, noopt=1, op_limit=65536):
  # llama3 is 128256
  vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
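The test_arange additions above lean heavily on the FUSE_ARANGE flag. Below is a minimal sketch of the measurement pattern the new tests use, built only from names that already appear in the diff (Tensor, Context, GlobalCounters); it is illustrative, not part of the package.

```python
# Sketch of the FUSE_ARANGE pattern exercised by the tests above.
from tinygrad import Tensor, GlobalCounters
from tinygrad.helpers import Context

DSET, DDIM = 2048, 32
dataset = Tensor.rand(DSET, DDIM).realize()
idxs = Tensor([0, 3, 5, 6]).realize()

GlobalCounters.reset()
with Context(FUSE_ARANGE=1, NOOPT=1):
  out = dataset[idxs].realize()  # the gather fuses with the arange that generates the indices
print(GlobalCounters.global_ops)  # the fused test above asserts this stays below 4*DSET
```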
@@ -13,11 +13,11 @@ class TestAssign(unittest.TestCase):
  b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
  a.realize()
  b.realize()
- ba1 = a.lazydata.base.realized
- bb1 = b.lazydata.base.realized
+ ba1 = a.uop.base.realized
+ bb1 = b.uop.base.realized
  a += b
  a.realize()
- ba2 = a.lazydata.base.realized
+ ba2 = a.uop.base.realized
  assert ba1 == ba2 and ba1 != bb1
  np.testing.assert_allclose(a.numpy(), (np.arange(N*N)*2).reshape((N,N)))

@@ -259,13 +259,13 @@ class TestAssign(unittest.TestCase):
  b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
  a.realize()
  b.realize()
- ba1 = a.lazydata.base.realized
- bb1 = b.lazydata.base.realized
+ ba1 = a.uop.base.realized
+ bb1 = b.uop.base.realized
  with self.assertRaises((RuntimeError, AssertionError)):
  a = a.permute(1,0)
  a += b
  a.realize()
- ba2 = a.lazydata.base.realized
+ ba2 = a.uop.base.realized
  assert ba1 != ba2 and ba1 != bb1
  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

@@ -275,12 +275,12 @@ class TestAssign(unittest.TestCase):
  a.realize()
  b.realize()
  #GlobalCounters.cache = []
- ba1 = a.lazydata.base.realized # noqa: F841
- bb1 = b.lazydata.base.realized # noqa: F841
+ ba1 = a.uop.base.realized # noqa: F841
+ bb1 = b.uop.base.realized # noqa: F841
  with self.assertRaisesRegex(RuntimeError, "contiguous"):
  a.assign(a.permute(1,0) + b) # this should not work!
  a.realize()
- ba2 = a.lazydata.base.realized # noqa: F841
+ ba2 = a.uop.base.realized # noqa: F841
  # NOTE: don't test that it's assigned
  #assert ba1 == ba2 and ba1 != bb1
  np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
@@ -383,10 +383,10 @@ class TestAssign(unittest.TestCase):
  def test_cast_assignment(self):
  a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
  a.realize()
- oba1 = a.lazydata.base.output_buffer
+ oba1 = a.uop.base.output_buffer
  a.assign(a.cast(dtypes.int32).realize())
  a.realize()
- oba2 = a.lazydata.base.output_buffer
+ oba2 = a.uop.base.output_buffer
  assert oba1 is None and oba2 is None
  np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))
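Most of the test_assign churn is the rename of the Tensor.lazydata attribute to Tensor.uop. A minimal sketch of what the rename looks like from user code, assuming it is exposed exactly the way these tests use it:

```python
# lazydata -> uop rename as it appears in the test_assign diff (0.10.2 -> 0.11.0).
import numpy as np
from tinygrad import Tensor

a = Tensor(np.arange(16, dtype=np.float32)).reshape(4, 4)
a.realize()
buf = a.uop.base.realized  # 0.11.0 spelling; 0.10.2 code used a.lazydata.base.realized
assert buf is not None
```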