tinygrad 0.10.2.tar.gz → 0.11.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tinygrad-0.10.2/tinygrad.egg-info → tinygrad-0.11.0}/PKG-INFO +24 -16
- {tinygrad-0.10.2 → tinygrad-0.11.0}/README.md +5 -6
- {tinygrad-0.10.2 → tinygrad-0.11.0}/setup.py +37 -14
- tinygrad-0.11.0/test/test_amd_llvm.py +52 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_arange.py +78 -39
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_assign.py +11 -11
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_const_folding.py +27 -36
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_copy_speed.py +29 -4
- tinygrad-0.11.0/test/test_define_reg.py +32 -0
- tinygrad-0.11.0/test/test_disassembly.py +21 -0
- tinygrad-0.11.0/test/test_dtype.py +426 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_dtype_alu.py +19 -16
- tinygrad-0.11.0/test/test_edgecases.py +276 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_gc.py +33 -3
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_graph.py +38 -9
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_hcq.py +120 -40
- tinygrad-0.11.0/test/test_hcq_iface.py +105 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_image_dtype.py +31 -24
- tinygrad-0.11.0/test/test_interop.py +52 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_jit.py +270 -9
- tinygrad-0.11.0/test/test_jit_cases.py +78 -0
- tinygrad-0.11.0/test/test_linalg.py +76 -0
- tinygrad-0.11.0/test/test_linearizer.py +1423 -0
- tinygrad-0.11.0/test/test_linearizer_dumb.py +201 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_linearizer_overflows.py +37 -68
- tinygrad-0.11.0/test/test_memory_planner.py +124 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_multitensor.py +274 -124
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_nn.py +75 -171
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_ops.py +568 -249
- tinygrad-0.11.0/test/test_opt_gemm.py +43 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_optim.py +27 -1
- tinygrad-0.11.0/test/test_outerworld_range.py +148 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_pickle.py +12 -8
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_profiler.py +65 -15
- tinygrad-0.11.0/test/test_quantize_onnx.py +364 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_randomness.py +50 -16
- tinygrad-0.11.0/test/test_remote.py +99 -0
- tinygrad-0.11.0/test/test_renderer_failures.py +121 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_sample.py +3 -1
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_schedule.py +520 -520
- tinygrad-0.11.0/test/test_search.py +146 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_setitem.py +39 -13
- tinygrad-0.11.0/test/test_softmax_fusion.py +202 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_speed_v_torch.py +7 -4
- tinygrad-0.11.0/test/test_stunning.py +59 -0
- tinygrad-0.11.0/test/test_subbuffer.py +183 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_symbolic_jit.py +35 -1
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_symbolic_ops.py +97 -6
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor.py +105 -40
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_uop.py +5 -5
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_variable.py +30 -24
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tiny.py +28 -9
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_transcendental.py +36 -5
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uop_graph.py +163 -140
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uops.py +114 -119
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_uops_stats.py +30 -34
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_winograd.py +18 -29
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_zero_copy.py +1 -1
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/__init__.py +1 -1
- tinygrad-0.11.0/tinygrad/apps/llm.py +206 -0
- tinygrad-0.11.0/tinygrad/codegen/__init__.py +116 -0
- tinygrad-0.11.0/tinygrad/codegen/devectorizer.py +390 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/codegen/expander.py +8 -16
- tinygrad-0.11.0/tinygrad/codegen/gpudims.py +89 -0
- tinygrad-0.11.0/tinygrad/codegen/linearize.py +236 -0
- tinygrad-0.11.0/tinygrad/codegen/lowerer.py +114 -0
- tinygrad-0.11.0/tinygrad/codegen/opt/__init__.py +38 -0
- tinygrad-0.11.0/tinygrad/codegen/opt/heuristic.py +125 -0
- tinygrad-0.11.0/tinygrad/codegen/opt/kernel.py +510 -0
- {tinygrad-0.10.2/tinygrad/engine → tinygrad-0.11.0/tinygrad/codegen/opt}/search.py +51 -35
- tinygrad-0.11.0/tinygrad/codegen/opt/swizzler.py +134 -0
- tinygrad-0.11.0/tinygrad/codegen/opt/tc.py +127 -0
- tinygrad-0.11.0/tinygrad/codegen/quantize.py +67 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/device.py +122 -132
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/dtype.py +152 -35
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/engine/jit.py +81 -54
- tinygrad-0.11.0/tinygrad/engine/memory.py +69 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/engine/realize.py +82 -41
- tinygrad-0.11.0/tinygrad/engine/schedule.py +83 -0
- tinygrad-0.11.0/tinygrad/frontend/onnx.py +1253 -0
- tinygrad-0.11.0/tinygrad/frontend/torch.py +5 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/gradient.py +19 -27
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/helpers.py +95 -47
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/__init__.py +7 -8
- tinygrad-0.11.0/tinygrad/nn/optim.py +177 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/state.py +37 -23
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/__init__.py +40 -60
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/cstyle.py +143 -128
- tinygrad-0.11.0/tinygrad/renderer/llvmir.py +242 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/ptx.py +50 -32
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/renderer/wgsl.py +27 -23
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/am.py +5861 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/amd_gpu.py +22115 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/comgr.py +35 -9
- tinygrad-0.11.0/tinygrad/runtime/autogen/comgr_3.py +906 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/cuda.py +2419 -494
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/hsa.py +57 -16
- tinygrad-0.11.0/tinygrad/runtime/autogen/ib.py +7171 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/io_uring.py +917 -118
- tinygrad-0.11.0/tinygrad/runtime/autogen/kfd.py +1548 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/libc.py +613 -218
- tinygrad-0.11.0/tinygrad/runtime/autogen/libusb.py +1643 -0
- tinygrad-0.11.0/tinygrad/runtime/autogen/nv/nv.py +8602 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/opencl.py +2 -4
- tinygrad-0.11.0/tinygrad/runtime/autogen/sqtt.py +1789 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/vfio.py +3 -3
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/webgpu.py +273 -264
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/cuda.py +3 -3
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/hcq.py +68 -29
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/metal.py +29 -13
- tinygrad-0.11.0/tinygrad/runtime/graph/remote.py +114 -0
- tinygrad-0.11.0/tinygrad/runtime/ops_amd.py +852 -0
- tinygrad-0.11.0/tinygrad/runtime/ops_cpu.py +125 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_cuda.py +12 -14
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_disk.py +13 -10
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_dsp.py +47 -40
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_gpu.py +13 -11
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_hip.py +6 -9
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_llvm.py +35 -15
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_metal.py +29 -19
- tinygrad-0.11.0/tinygrad/runtime/ops_npy.py +11 -0
- tinygrad-0.11.0/tinygrad/runtime/ops_null.py +28 -0
- tinygrad-0.11.0/tinygrad/runtime/ops_nv.py +621 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_python.py +62 -52
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_qcom.py +28 -39
- tinygrad-0.11.0/tinygrad/runtime/ops_remote.py +482 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/ops_webgpu.py +28 -28
- tinygrad-0.11.0/tinygrad/runtime/support/am/amdev.py +261 -0
- tinygrad-0.11.0/tinygrad/runtime/support/am/ip.py +502 -0
- tinygrad-0.11.0/tinygrad/runtime/support/amd.py +138 -0
- tinygrad-0.10.2/tinygrad/runtime/support/compiler_hip.py → tinygrad-0.11.0/tinygrad/runtime/support/compiler_amd.py +40 -8
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/compiler_cuda.py +8 -11
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/elf.py +2 -1
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/hcq.py +184 -97
- tinygrad-0.11.0/tinygrad/runtime/support/ib.py +172 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/llvm.py +3 -4
- tinygrad-0.11.0/tinygrad/runtime/support/memory.py +251 -0
- tinygrad-0.11.0/tinygrad/runtime/support/nv/ip.py +581 -0
- tinygrad-0.11.0/tinygrad/runtime/support/nv/nvdev.py +183 -0
- tinygrad-0.11.0/tinygrad/runtime/support/system.py +170 -0
- tinygrad-0.11.0/tinygrad/runtime/support/usb.py +268 -0
- tinygrad-0.11.0/tinygrad/runtime/support/webgpu.py +18 -0
- tinygrad-0.11.0/tinygrad/schedule/__init__.py +0 -0
- tinygrad-0.11.0/tinygrad/schedule/grouper.py +119 -0
- tinygrad-0.11.0/tinygrad/schedule/kernelize.py +368 -0
- tinygrad-0.11.0/tinygrad/schedule/multi.py +231 -0
- tinygrad-0.11.0/tinygrad/shape/__init__.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/shape/shapetracker.py +40 -46
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/shape/view.py +88 -52
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/tensor.py +968 -542
- tinygrad-0.11.0/tinygrad/uop/__init__.py +117 -0
- tinygrad-0.10.2/tinygrad/codegen/transcendental.py → tinygrad-0.11.0/tinygrad/uop/decompositions.py +125 -38
- tinygrad-0.11.0/tinygrad/uop/mathtraits.py +169 -0
- tinygrad-0.11.0/tinygrad/uop/ops.py +1021 -0
- tinygrad-0.11.0/tinygrad/uop/spec.py +228 -0
- {tinygrad-0.10.2/tinygrad/codegen → tinygrad-0.11.0/tinygrad/uop}/symbolic.py +239 -216
- tinygrad-0.11.0/tinygrad/uop/upat.py +163 -0
- tinygrad-0.11.0/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
- tinygrad-0.11.0/tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
- tinygrad-0.11.0/tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
- tinygrad-0.11.0/tinygrad/viz/index.html +344 -0
- tinygrad-0.11.0/tinygrad/viz/js/index.js +718 -0
- tinygrad-0.11.0/tinygrad/viz/js/worker.js +29 -0
- tinygrad-0.11.0/tinygrad/viz/serve.py +327 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0/tinygrad.egg-info}/PKG-INFO +24 -16
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/SOURCES.txt +70 -21
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/requires.txt +16 -9
- tinygrad-0.10.2/test/test_conv.py +0 -150
- tinygrad-0.10.2/test/test_conv_shapetracker.py +0 -55
- tinygrad-0.10.2/test/test_dtype.py +0 -893
- tinygrad-0.10.2/test/test_fuzz_shape_ops.py +0 -88
- tinygrad-0.10.2/test/test_linearizer.py +0 -2203
- tinygrad-0.10.2/test/test_linearizer_dumb.py +0 -225
- tinygrad-0.10.2/test/test_linearizer_failures.py +0 -1415
- tinygrad-0.10.2/test/test_masked_st.py +0 -32
- tinygrad-0.10.2/test/test_quantize_onnx.py +0 -212
- tinygrad-0.10.2/test/test_rearrange_einops.py +0 -321
- tinygrad-0.10.2/test/test_renderer_failures.py +0 -76
- tinygrad-0.10.2/test/test_search.py +0 -190
- tinygrad-0.10.2/test/test_subbuffer.py +0 -68
- tinygrad-0.10.2/test/test_symbolic_shapetracker.py +0 -244
- tinygrad-0.10.2/tinygrad/codegen/devectorizer.py +0 -247
- tinygrad-0.10.2/tinygrad/codegen/kernel.py +0 -693
- tinygrad-0.10.2/tinygrad/codegen/linearize.py +0 -234
- tinygrad-0.10.2/tinygrad/codegen/lowerer.py +0 -161
- tinygrad-0.10.2/tinygrad/engine/memory.py +0 -50
- tinygrad-0.10.2/tinygrad/engine/multi.py +0 -161
- tinygrad-0.10.2/tinygrad/engine/schedule.py +0 -458
- tinygrad-0.10.2/tinygrad/nn/optim.py +0 -146
- tinygrad-0.10.2/tinygrad/ops.py +0 -1003
- tinygrad-0.10.2/tinygrad/renderer/llvmir.py +0 -191
- tinygrad-0.10.2/tinygrad/runtime/autogen/amd_gpu.py +0 -87879
- tinygrad-0.10.2/tinygrad/runtime/autogen/kfd.py +0 -826
- tinygrad-0.10.2/tinygrad/runtime/ops_amd.py +0 -635
- tinygrad-0.10.2/tinygrad/runtime/ops_cloud.py +0 -220
- tinygrad-0.10.2/tinygrad/runtime/ops_cpu.py +0 -24
- tinygrad-0.10.2/tinygrad/runtime/ops_npy.py +0 -9
- tinygrad-0.10.2/tinygrad/runtime/ops_nv.py +0 -549
- tinygrad-0.10.2/tinygrad/runtime/support/allocator.py +0 -94
- tinygrad-0.10.2/tinygrad/runtime/support/am/amdev.py +0 -396
- tinygrad-0.10.2/tinygrad/runtime/support/am/ip.py +0 -463
- tinygrad-0.10.2/tinygrad/spec.py +0 -155
- tinygrad-0.10.2/tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
- tinygrad-0.10.2/tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
- tinygrad-0.10.2/tinygrad/viz/index.html +0 -544
- tinygrad-0.10.2/tinygrad/viz/perfetto.html +0 -178
- tinygrad-0.10.2/tinygrad/viz/serve.py +0 -205
- {tinygrad-0.10.2 → tinygrad-0.11.0}/LICENSE +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/setup.cfg +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_compile_failures.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_device_speed.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_fusion_op.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_kernel_cache.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_metal.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_method_cache.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_net_speed.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_ocl.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_specific_conv.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_tensor_data.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_to_numpy.py +0 -0
- {tinygrad-0.10.2/tinygrad/codegen → tinygrad-0.11.0/tinygrad/engine}/__init__.py +0 -0
- {tinygrad-0.10.2/tinygrad/engine → tinygrad-0.11.0/tinygrad/frontend}/__init__.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/nn/datasets.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/py.typed +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/__init__.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/adreno.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/hip.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/kgsl.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/llvm.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/nvrtc.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/pci.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/autogen/qcom_dsp.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/graph/__init__.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/__init__.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/runtime/support/am/__init__.py +0 -0
- {tinygrad-0.10.2/tinygrad/shape → tinygrad-0.11.0/tinygrad/runtime/support/nv}/__init__.py +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/dependency_links.txt +0 -0
- {tinygrad-0.10.2 → tinygrad-0.11.0}/tinygrad.egg-info/top_level.txt +0 -0
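Most of the churn in the list above is a package restructuring rather than new code: `tinygrad/ops.py` moves to `tinygrad/uop/ops.py`, `tinygrad/codegen/kernel.py` to `tinygrad/codegen/opt/kernel.py`, and `tinygrad/engine/search.py` to `tinygrad/codegen/opt/search.py`. A minimal sketch of what that means for downstream imports, using only paths that appear in the test hunks below (the commented lines are the 0.10.2 layout):

```python
# 0.10.2 layout (modules removed or moved in 0.11.0):
#   from tinygrad.ops import Ops
#   from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError

# 0.11.0 layout, as imported by the updated test_arange.py below:
from tinygrad.uop.ops import Ops
from tinygrad.codegen.opt.kernel import Opt, OptOps, Kernel, KernelOptError
from tinygrad.codegen.opt.search import get_kernel_actions
```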
{tinygrad-0.10.2/tinygrad.egg-info → tinygrad-0.11.0}/PKG-INFO (the source view truncates long removed lines):

````diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: tinygrad
-Version: 0.
+Version: 0.11.0
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
@@ -19,31 +19,38 @@ Requires-Dist: mypy==1.13.0; extra == "linting"
 Requires-Dist: typing-extensions; extra == "linting"
 Requires-Dist: pre-commit; extra == "linting"
 Requires-Dist: ruff; extra == "linting"
-Requires-Dist:
+Requires-Dist: numpy; extra == "linting"
 Provides-Extra: testing-minimal
 Requires-Dist: numpy; extra == "testing-minimal"
-Requires-Dist: torch; extra == "testing-minimal"
+Requires-Dist: torch==2.7.1; extra == "testing-minimal"
 Requires-Dist: pytest; extra == "testing-minimal"
 Requires-Dist: pytest-xdist; extra == "testing-minimal"
 Requires-Dist: hypothesis; extra == "testing-minimal"
+Requires-Dist: z3-solver; extra == "testing-minimal"
+Requires-Dist: ml_dtypes; extra == "testing-minimal"
 Provides-Extra: testing-unit
 Requires-Dist: numpy; extra == "testing-unit"
-Requires-Dist: torch; extra == "testing-unit"
+Requires-Dist: torch==2.7.1; extra == "testing-unit"
 Requires-Dist: pytest; extra == "testing-unit"
 Requires-Dist: pytest-xdist; extra == "testing-unit"
 Requires-Dist: hypothesis; extra == "testing-unit"
+Requires-Dist: z3-solver; extra == "testing-unit"
+Requires-Dist: ml_dtypes; extra == "testing-unit"
 Requires-Dist: tqdm; extra == "testing-unit"
 Requires-Dist: safetensors; extra == "testing-unit"
 Requires-Dist: tabulate; extra == "testing-unit"
 Provides-Extra: testing
 Requires-Dist: numpy; extra == "testing"
-Requires-Dist: torch; extra == "testing"
+Requires-Dist: torch==2.7.1; extra == "testing"
 Requires-Dist: pytest; extra == "testing"
 Requires-Dist: pytest-xdist; extra == "testing"
 Requires-Dist: hypothesis; extra == "testing"
+Requires-Dist: z3-solver; extra == "testing"
+Requires-Dist: ml_dtypes; extra == "testing"
 Requires-Dist: pillow; extra == "testing"
-Requires-Dist: onnx==1.
+Requires-Dist: onnx==1.18.0; extra == "testing"
 Requires-Dist: onnx2torch; extra == "testing"
+Requires-Dist: onnxruntime; extra == "testing"
 Requires-Dist: opencv-python; extra == "testing"
 Requires-Dist: tabulate; extra == "testing"
 Requires-Dist: tqdm; extra == "testing"
@@ -58,6 +65,10 @@ Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
 Requires-Dist: ggml-python; extra == "testing"
 Requires-Dist: capstone; extra == "testing"
+Requires-Dist: pycocotools; extra == "testing"
+Requires-Dist: boto3; extra == "testing"
+Requires-Dist: pandas; extra == "testing"
+Requires-Dist: influxdb3-python; extra == "testing"
 Provides-Extra: docs
 Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
@@ -66,14 +77,12 @@ Requires-Dist: markdown-callouts; extra == "docs"
 Requires-Dist: markdown-exec[ansi]; extra == "docs"
 Requires-Dist: black; extra == "docs"
 Requires-Dist: numpy; extra == "docs"
-Provides-Extra: testing-tf
-Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
-Requires-Dist: tensorflow_addons; extra == "testing-tf"
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-python
 Dynamic: summary
@@ -101,11 +110,11 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

 ---

-
+Despite tinygrad's size, it is a fully featured deep learning framework.

-Due to its extreme simplicity, it
+Due to its extreme simplicity, it is the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.

-tinygrad is
+tinygrad is now beta software, we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.

 ## Features

@@ -119,9 +128,8 @@ Try a matmul. See how, despite the style, it is fused into one kernel with the p

 ```sh
 DEBUG=3 python3 -c "from tinygrad import Tensor;
-N = 1024; a, b = Tensor.
-
-print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
+N = 1024; a, b = Tensor.empty(N, N), Tensor.empty(N, N);
+(a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()"
 ```

 And we can change `DEBUG` to `4` to see the generated code.
````
{tinygrad-0.10.2 → tinygrad-0.11.0}/README.md:

````diff
@@ -21,11 +21,11 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

 ---

-
+Despite tinygrad's size, it is a fully featured deep learning framework.

-Due to its extreme simplicity, it
+Due to its extreme simplicity, it is the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.

-tinygrad is
+tinygrad is now beta software, we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.

 ## Features

@@ -39,9 +39,8 @@ Try a matmul. See how, despite the style, it is fused into one kernel with the p

 ```sh
 DEBUG=3 python3 -c "from tinygrad import Tensor;
-N = 1024; a, b = Tensor.
-
-print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
+N = 1024; a, b = Tensor.empty(N, N), Tensor.empty(N, N);
+(a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()"
 ```

 And we can change `DEBUG` to `4` to see the generated code.
````
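The README's matmul demo now builds the product from primitive ops instead of comparing against NumPy: with `a` reshaped to (N,1,N) and `b.T` reshaped to (1,N,N), broadcasting forms p[i,j,k] = a[i,k]·b[k,j], and summing over axis 2 is exactly the matmul. A quick NumPy check of that identity (a sketch, not part of the diff):

```python
import numpy as np

N = 4
a, b = np.random.rand(N, N), np.random.rand(N, N)

# a.reshape(N,1,N)[i,0,k] = a[i,k] and b.T.reshape(1,N,N)[0,j,k] = b[k,j],
# so the broadcasted product is p[i,j,k] = a[i,k] * b[k,j]; summing over k is matmul.
c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2)
assert np.allclose(c, a @ b)
```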
{tinygrad-0.10.2 → tinygrad-0.11.0}/setup.py:

```diff
@@ -9,22 +9,44 @@ with open(directory / 'README.md', encoding='utf-8') as f:

 testing_minimal = [
   "numpy",
-  "torch",
+  "torch==2.7.1",
   "pytest",
   "pytest-xdist",
   "hypothesis",
+  "z3-solver",
+  "ml_dtypes"
 ]

 setup(name='tinygrad',
-      version='0.
+      version='0.11.0',
       description='You like pytorch? You like micrograd? You love tinygrad! <3',
       author='George Hotz',
       license='MIT',
       long_description=long_description,
       long_description_content_type='text/markdown',
-      packages = [
-
-
+      packages = [
+        'tinygrad',
+        'tinygrad.apps',
+        'tinygrad.codegen',
+        'tinygrad.codegen.opt',
+        'tinygrad.engine',
+        'tinygrad.frontend',
+        'tinygrad.nn',
+        'tinygrad.renderer',
+        'tinygrad.runtime',
+        'tinygrad.runtime.autogen',
+        'tinygrad.runtime.autogen.am',
+        'tinygrad.runtime.autogen.nv',
+        'tinygrad.runtime.graph',
+        'tinygrad.runtime.support',
+        'tinygrad.runtime.support.am',
+        'tinygrad.runtime.support.nv',
+        'tinygrad.schedule',
+        'tinygrad.shape',
+        'tinygrad.uop',
+        'tinygrad.viz',
+      ],
+      package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'assets/**/*', 'js/*']},
       classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License"
@@ -40,19 +62,20 @@ setup(name='tinygrad',
         "typing-extensions",
         "pre-commit",
         "ruff",
-        "
+        "numpy",
       ],
-      #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@
+      #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@5.0.0-rc3"],
       'testing_minimal': testing_minimal,
       'testing_unit': testing_minimal + [
         "tqdm",
         "safetensors",
-        "tabulate" # for sz.py
+        "tabulate", # for sz.py
       ],
       'testing': testing_minimal + [
         "pillow",
-        "onnx==1.
+        "onnx==1.18.0",
         "onnx2torch",
+        "onnxruntime",
         "opencv-python",
         "tabulate",
         "tqdm",
@@ -66,7 +89,11 @@ setup(name='tinygrad',
         "nibabel",
         "bottle",
         "ggml-python",
-        "capstone"
+        "capstone",
+        "pycocotools",
+        "boto3",
+        "pandas",
+        "influxdb3-python"
       ],
       'docs': [
         "mkdocs",
@@ -77,9 +104,5 @@ setup(name='tinygrad',
         "black",
         "numpy",
       ],
-      'testing_tf': [
-        "tensorflow==2.15.1",
-        "tensorflow_addons",
-      ],
       },
       include_package_data=True)
```
tinygrad-0.11.0/test/test_amd_llvm.py (new file):

```diff
@@ -0,0 +1,52 @@
+import unittest
+import numpy as np
+from tinygrad import Device
+from tinygrad.device import CompileError
+from tinygrad.helpers import flat_mv
+if Device.DEFAULT=="AMD":
+  from tinygrad.runtime.ops_amd import AMDAllocator, AMDDevice, AMDProgram
+  from tinygrad.runtime.support.compiler_amd import AMDLLVMCompiler
+
+@unittest.skipUnless(Device.DEFAULT == "AMD", "Runs only on AMD")
+class TestAMDLLVM(unittest.TestCase):
+  def test_compiler(self):
+    src = '''
+; https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/imm.ll
+define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
+entry:
+  store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
+  ret void
+}
+'''
+    device = AMDDevice()
+    compiler = AMDLLVMCompiler("gfx1100")
+    obj = compiler.compile(src)
+    allocator = AMDAllocator(device)
+    a = allocator.alloc(1*8)
+    prog = AMDProgram(device, "test", obj)
+    prog(a, wait=True)
+    na = np.empty(1, np.uint64)
+    allocator._copyout(flat_mv(na.data), a)
+    assert na == [0x1234567800000005]
+
+  def test_compiler_diag_error(self):
+    src = """
+@local_temp0 = internal unnamed_addr addrspace(3) global [{N} x float*] undef, align 16
+define amdgpu_kernel void @test(float* noalias align 32 %data0, half* noalias align 32 %data1, float* noalias align 32 %data2) #0
+{{
+  %local_temp0 = addrspacecast [{N} x float*] addrspace(3)* @local_temp0 to [{N} x float*]*
+  %v178 = getelementptr inbounds float, float* %local_temp0, i32 1
+  %v133 = getelementptr inbounds float, float* %data2, i32 1
+  %v134 = load float, float* %v133
+  store float %v134, float* %v178
+  ret void
+}}
+"""
+    compiler = AMDLLVMCompiler("gfx1100")
+    compiler.compile(src.format(N=65536//8))
+    with self.assertRaises(CompileError):
+      # llvm diagnostic: <unknown>:0:0: local memory (65544) exceeds limit (65536) in function 'test'
+      compiler.compile(src.format(N=65536//8+1))
+
+if __name__ == '__main__':
+  unittest.main()
```
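`test_compiler_diag_error` sizes the LDS array to land exactly on the local-memory limit the LLVM diagnostic reports: 65536 bytes, with each `float*` slot taking 8 bytes. The arithmetic behind the two `compile` calls, spelled out (a sketch; the limit figure comes from the diagnostic quoted in the test):

```python
LIMIT = 65536          # local memory limit reported by the compiler, in bytes
SLOT = 8               # one float* in the [{N} x float*] LDS array

n_fits = 65536 // 8                      # N = 8192 slots
assert n_fits * SLOT == LIMIT            # exactly at the limit: compiles
assert (n_fits + 1) * SLOT == 65544      # one slot over: 65544 > 65536 -> CompileError
```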
{tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_arange.py:

```diff
@@ -1,12 +1,13 @@
 import unittest, contextlib
 import numpy as np
-from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device
+from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
 from tinygrad.helpers import CI, Context, getenv
 from tinygrad.engine.realize import run_schedule
-from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
-from tinygrad.engine.realize import CompiledRunner, ExecItem
-from tinygrad.
-from tinygrad.ops import Ops
+from tinygrad.codegen.opt.kernel import Opt, OptOps, Kernel, KernelOptError
+from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
+from tinygrad.codegen.opt.search import get_kernel_actions
+from tinygrad.uop.ops import Ops
+from tinygrad.codegen import apply_rewrites, rewrites_for_views

 class TestArange(unittest.TestCase):
   def _get_flops(self, N, opts=None):
@@ -14,41 +15,46 @@ class TestArange(unittest.TestCase):
     tt = Tensor.arange(N)
     sched = tt.schedule()
     self.assertEqual(len(sched), 1)
-
-    if opts is not None:
-      for o in opts: k.apply_opt(o)
-    p = k.to_program()
+    p = get_program(sched[-1].ast, opts=opts)
     print(p.name)
     #print(p.src)
-    ExecItem(CompiledRunner(p), [tt.
+    ExecItem(CompiledRunner(p), [tt.uop.buffer]).run()
     np.testing.assert_equal(tt.numpy(), np.arange(N))
     return p.estimates.ops

   def test_complexity(self, opts=None, limit=None):
-
-
-    f2 = self._get_flops(2560, opts) + 1
+    f1 = self._get_flops(256, opts)
+    f2 = self._get_flops(2560, opts)
     print(f"{f1=}, {f2=}")
-
+    # add 1 to avoid divide by 0. arange is 0 flops now!
+    assert (f1 < 6000 and f2 < 6000) or ((f2+1) / (f1+1) < 16), f"bad complexity, flops {(f2+1) / (f1+1):.1f}X while inputs 10X"
     if limit is not None and not getenv("PTX"):
       # PTX counts index ALU in flops
       assert f1 <= limit, f"{f1=}, {limit=}"

-  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=
-  def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=
-  def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=
-  def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=
-  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=
+  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=0)
+  def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=0)
+  def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=0)
+  def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=0)
+  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=0)

-
-
+  if Device.default.renderer.has_local:
+    # TODO: fix limit
+    def test_complexity_w_group(self): return self.test_complexity([Opt(OptOps.GROUP, 0, 16)], limit=81920)
+    def test_complexity_w_group_top(self): return self.test_complexity([Opt(OptOps.GROUPTOP, 0, 16)], limit=106496)
+
+    def test_complexity_w_local(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16)], limit=0)
+    @unittest.skip("doesn't work yet. TODO: this absolutely should work")
+    def test_complexity_w_local_unroll4(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)], limit=0)
+    @unittest.skip("doesn't work yet")
+    def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.PADTO, axis=1, arg=32)])

   def test_all_opts(self, opts=None, exclude=None):
-    k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+    k = Kernel(apply_rewrites(Tensor.arange(256).schedule()[-1].ast, rewrites_for_views))
     if opts is not None:
       for o in opts: k.apply_opt(o)
     all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
-    k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+    k = Kernel(apply_rewrites(Tensor.arange(2560).schedule()[-1].ast, rewrites_for_views))
     if opts is not None:
       for o in opts: k.apply_opt(o)
     all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
```
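`test_complexity` is a guard against `arange` kernels regressing to quadratic time: the input grows 10x (256 → 2560), so a linear kernel's flop count should grow roughly 10x, under the 16x threshold, while a naive O(N²) formulation (every output element reducing over every input) grows about 100x. A standalone illustration with hypothetical cost models (pure Python, not tinygrad code):

```python
def flops_linear(n): return 2 * n      # hypothetical O(N) cost model
def flops_quadratic(n): return n * n   # hypothetical O(N^2) cost model

for flops in (flops_linear, flops_quadratic):
  f1, f2 = flops(256), flops(2560)
  ratio = (f2 + 1) / (f1 + 1)          # +1 avoids divide by zero: arange can be 0 flops
  print(f"{flops.__name__}: {ratio:.1f}x flops for 10x input ->",
        "ok" if ratio < 16 else "bad complexity")
```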
test/test_arange.py (continued):

```diff
@@ -65,6 +71,24 @@ class TestArange(unittest.TestCase):
   def test_all_opts_w_upcast_and_unroll(self):
     return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])

+class TestRand(unittest.TestCase):
+  def test_fused_rand_less_ops(self, noopt=1):
+    GlobalCounters.reset()
+    with Context(FUSE_ARANGE=0, NOOPT=noopt):
+      out = Tensor.rand(16384)
+      out.realize()
+    unfused_ops = GlobalCounters.global_ops
+
+    GlobalCounters.reset()
+    with Context(FUSE_ARANGE=1, NOOPT=noopt):
+      out = Tensor.rand(16384)
+      out.realize()
+    print(f"fused {GlobalCounters.global_ops} unfused {unfused_ops}")
+    self.assertLessEqual(GlobalCounters.global_ops, unfused_ops*2)
+  def test_fused_rand_less_ops_opt(self): self.test_fused_rand_less_ops(0)
+
+DSET, DDIM = 2048, 32
+
 class TestIndexing(unittest.TestCase):
   def test_arange_2_reduce(self):
     needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
@@ -80,52 +104,63 @@ class TestIndexing(unittest.TestCase):

   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
   def test_manual_index(self):
-    dataset = Tensor.rand(
+    dataset = Tensor.rand(DSET, DDIM).realize()
     idxs = Tensor([0,3,5,6]).realize()
     real_index = dataset.numpy()[idxs.numpy()]
     print("*** indexing ***")
     with Context(NOOPT=1, FUSE_ARANGE=1):
       GlobalCounters.reset()
-      rng = Tensor.ones(4,
-      idxs = idxs.reshape(4,1,1,1).expand(4,
-      reshape_dataset = dataset.T.reshape(1,
-      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4,
+      rng = Tensor.ones(4, DDIM, DSET, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, DDIM, DSET, 1)
+      idxs = idxs.reshape(4,1,1,1).expand(4, DDIM, DSET, 1)
+      reshape_dataset = dataset.T.reshape(1, DDIM, DSET, 1).expand(4, DDIM, DSET, 1)
+      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, DDIM, DSET, 1))
       X = full.sum(axis=(2,3))
       sched = X.schedule()
       self.assertEqual(len(sched), 1)
       run_schedule(sched)
-      assert GlobalCounters.global_ops < 4*
+      assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops}"
     np.testing.assert_allclose(real_index, X.numpy())

+  def test_index_variable(self):
+    dataset = Tensor.rand(DSET, DDIM).realize()
+    v = Variable("v", 0, DDIM-1)
+    with Context(NOOPT=1, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+      GlobalCounters.reset()
+      vb = Tensor(v.bind(12))
+      comp = dataset[vb].numpy()
+      # no global ops because they are all indexing
+      self.assertEqual(GlobalCounters.global_ops, 0)
+    np.testing.assert_allclose(comp, dataset.numpy()[12])
+
   def test_index(self):
-    dataset = Tensor.rand(
+    dataset = Tensor.rand(DSET, DDIM).realize()
     idxs = Tensor([0,3,5,6]).realize()
     real_index = dataset.numpy()[idxs.numpy()]
     print("*** indexing ***")
     with Context(NOOPT=1):
       GlobalCounters.reset()
       X = dataset[idxs]
-      assert X.shape == (4,
+      assert X.shape == (4,DDIM)
       sched = X.schedule()
       # TODO: enable these asserts when the scheduler can handle this
       #self.assertEqual(len(sched), 1)
       run_schedule(sched)
-      #assert GlobalCounters.global_ops < 4*
+      #assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops}"
     np.testing.assert_allclose(real_index, X.numpy())

   def test_index_fused(self, noopt=1):
-    dataset = Tensor.rand(
+    dataset = Tensor.rand(DSET, DDIM).realize()
     idxs = Tensor([0,3,5,6]).realize()
     real_index = dataset.numpy()[idxs.numpy()]
     print("*** indexing ***")
     with Context(NOOPT=noopt, FUSE_ARANGE=1):
       GlobalCounters.reset()
       X = dataset[idxs]
-      assert X.shape == (4,
+      assert X.shape == (4,DDIM)
       sched = X.schedule()
       self.assertEqual(len(sched), 2)
       run_schedule(sched)
-      assert GlobalCounters.global_ops < 4*
+      assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops} != {4*DSET}"
     np.testing.assert_allclose(real_index, X.numpy())
   @unittest.skip("not ready")
   def test_index_fused_opt(self): self.test_index_fused(0)
```
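The new `test_index_variable` exercises symbolic indexing: `Variable` declares a bounded symbolic integer, `bind` attaches a concrete value at run time, and indexing with the bound variable avoids materializing an arange-based one-hot gather. A minimal usage sketch condensed from the test above:

```python
from tinygrad import Tensor, Variable

dataset = Tensor.rand(2048, 32).realize()
v = Variable("v", 0, 31)    # symbolic index with bounds [0, 31]
vb = Tensor(v.bind(12))     # bind a concrete value for this run
row = dataset[vb]           # the kernel stays generic over the symbolic index
print(row.numpy())          # matches dataset.numpy()[12]
```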
test/test_arange.py (continued):

```diff
@@ -138,10 +173,12 @@ class TestIndexing(unittest.TestCase):
     np.testing.assert_equal(X.numpy(), 0)

   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
-  def test_index_mnist(self, noopt=1, op_limit=512*784*13):
+  def test_index_mnist(self, noopt=1, op_limit=512*784*13, split_reduceop=0):
+    # WEBGPU generates more ops due to bitpacking of < 4-byte dtypes
+    if Device.DEFAULT == "WEBGPU": op_limit *= 15
     from tinygrad.nn.datasets import mnist
     X_train, Y_train, _, _ = mnist()
-    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=
+    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=split_reduceop):
       samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0]).realize()
       GlobalCounters.reset()
       x = X_train[samples].numpy()
@@ -149,10 +186,12 @@ class TestIndexing(unittest.TestCase):
     assert GlobalCounters.global_ops < op_limit, f"too many ops {GlobalCounters.global_ops} != {op_limit}"
     np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
     np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
-
+
   def test_index_mnist_opt(self): self.test_index_mnist(0)
+  def test_index_mnist_split(self): self.test_index_mnist(1, split_reduceop=1)
+  def test_index_mnist_opt_split(self): self.test_index_mnist(0, split_reduceop=1)

-  @unittest.skipIf(getenv("PTX")
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
   def test_llama_embedding(self, noopt=1, op_limit=65536):
     # llama3 is 128256
     vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
```
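Throughout these tests, `Context` temporarily overrides tinygrad's environment-variable knobs for the enclosed block (`NOOPT` disables kernel optimizations; `FUSE_ARANGE` and `SPLIT_REDUCEOP` gate arange fusion and reduce splitting). A hedged usage sketch of the pattern the mnist test follows:

```python
from tinygrad import Tensor, GlobalCounters
from tinygrad.helpers import Context

X = Tensor.rand(2048, 32).realize()
idxs = Tensor([0, 3, 5, 6]).realize()
with Context(FUSE_ARANGE=1, NOOPT=1):   # overrides apply only inside this block
  GlobalCounters.reset()
  out = X[idxs].numpy()                 # gather runs with arange fusion enabled
print(GlobalCounters.global_ops)        # op count stays proportional to the gather
```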
{tinygrad-0.10.2 → tinygrad-0.11.0}/test/test_assign.py:

```diff
@@ -13,11 +13,11 @@ class TestAssign(unittest.TestCase):
     b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
     a.realize()
     b.realize()
-    ba1 = a.
-    bb1 = b.
+    ba1 = a.uop.base.realized
+    bb1 = b.uop.base.realized
     a += b
     a.realize()
-    ba2 = a.
+    ba2 = a.uop.base.realized
     assert ba1 == ba2 and ba1 != bb1
     np.testing.assert_allclose(a.numpy(), (np.arange(N*N)*2).reshape((N,N)))

@@ -259,13 +259,13 @@ class TestAssign(unittest.TestCase):
     b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
     a.realize()
     b.realize()
-    ba1 = a.
-    bb1 = b.
+    ba1 = a.uop.base.realized
+    bb1 = b.uop.base.realized
     with self.assertRaises((RuntimeError, AssertionError)):
       a = a.permute(1,0)
       a += b
       a.realize()
-      ba2 = a.
+      ba2 = a.uop.base.realized
       assert ba1 != ba2 and ba1 != bb1
     np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

@@ -275,12 +275,12 @@ class TestAssign(unittest.TestCase):
     a.realize()
     b.realize()
     #GlobalCounters.cache = []
-    ba1 = a.
-    bb1 = b.
+    ba1 = a.uop.base.realized # noqa: F841
+    bb1 = b.uop.base.realized # noqa: F841
     with self.assertRaisesRegex(RuntimeError, "contiguous"):
       a.assign(a.permute(1,0) + b) # this should not work!
       a.realize()
-      ba2 = a.
+      ba2 = a.uop.base.realized # noqa: F841
     # NOTE: don't test that it's assigned
     #assert ba1 == ba2 and ba1 != bb1
     np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

@@ -383,10 +383,10 @@ class TestAssign(unittest.TestCase):
   def test_cast_assignment(self):
     a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
     a.realize()
-    oba1 = a.
+    oba1 = a.uop.base.output_buffer
     a.assign(a.cast(dtypes.int32).realize())
     a.realize()
-    oba2 = a.
+    oba2 = a.uop.base.output_buffer
     assert oba1 is None and oba2 is None
     np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))
```
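The `test_assign` edits are mechanical: the accessor for a tensor's backing node is now `Tensor.uop` (the 0.10.2 name is truncated in this view), so the realized buffer is reached as `t.uop.base.realized`. A minimal sketch of the invariant the first test checks, that `+=` into a realized tensor reuses its buffer:

```python
import numpy as np
from tinygrad import Tensor

a = Tensor(np.arange(16, dtype=np.float32)).reshape(4, 4)
b = Tensor(np.ones((4, 4), dtype=np.float32))
a.realize(); b.realize()

ba1 = a.uop.base.realized    # 0.11.0 accessor, as in the hunks above
a += b
a.realize()
ba2 = a.uop.base.realized
assert ba1 == ba2            # in-place add kept the same realized buffer
```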