tinygrad 0.10.0__tar.gz → 0.10.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tinygrad-0.10.0 → tinygrad-0.10.2}/PKG-INFO +36 -13
- {tinygrad-0.10.0 → tinygrad-0.10.2}/README.md +7 -6
- {tinygrad-0.10.0 → tinygrad-0.10.2}/setup.py +23 -14
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_arange.py +18 -17
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_assign.py +18 -11
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_const_folding.py +80 -10
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_conv_shapetracker.py +5 -8
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_copy_speed.py +5 -5
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_device_speed.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_dtype.py +55 -17
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_dtype_alu.py +41 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_fusion_op.py +8 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_fuzz_shape_ops.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_gc.py +24 -8
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_graph.py +2 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_hcq.py +105 -77
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_image_dtype.py +63 -8
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_jit.py +102 -3
- tinygrad-0.10.2/test/test_kernel_cache.py +29 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer.py +209 -180
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_dumb.py +9 -7
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_failures.py +99 -119
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_linearizer_overflows.py +11 -11
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_metal.py +3 -5
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_multitensor.py +171 -53
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_net_speed.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_nn.py +48 -36
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_ops.py +520 -95
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_optim.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_pickle.py +60 -6
- tinygrad-0.10.2/test/test_profiler.py +163 -0
- tinygrad-0.10.2/test/test_quantize_onnx.py +212 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_randomness.py +4 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_renderer_failures.py +17 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_sample.py +2 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_schedule.py +944 -183
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_search.py +43 -11
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_setitem.py +24 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_speed_v_torch.py +10 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_subbuffer.py +20 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_ops.py +0 -2
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_shapetracker.py +12 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor.py +142 -58
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor_data.py +14 -0
- tinygrad-0.10.0/test/test_lazybuffer.py → tinygrad-0.10.2/test/test_tensor_uop.py +28 -45
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tiny.py +44 -10
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_transcendental.py +19 -8
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uop_graph.py +128 -67
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uops.py +229 -61
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_uops_stats.py +32 -18
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_winograd.py +11 -4
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_zero_copy.py +1 -1
- tinygrad-0.10.2/tinygrad/codegen/devectorizer.py +247 -0
- tinygrad-0.10.2/tinygrad/codegen/expander.py +121 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/kernel.py +141 -201
- tinygrad-0.10.2/tinygrad/codegen/linearize.py +234 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/lowerer.py +60 -42
- tinygrad-0.10.2/tinygrad/codegen/symbolic.py +476 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/transcendental.py +22 -13
- tinygrad-0.10.2/tinygrad/device.py +361 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/dtype.py +39 -28
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/jit.py +83 -65
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/memory.py +4 -5
- tinygrad-0.10.2/tinygrad/engine/multi.py +161 -0
- tinygrad-0.10.2/tinygrad/engine/realize.py +171 -0
- tinygrad-0.10.2/tinygrad/engine/schedule.py +458 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/search.py +55 -66
- tinygrad-0.10.2/tinygrad/gradient.py +73 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/helpers.py +81 -59
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/__init__.py +30 -32
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/datasets.py +1 -2
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/optim.py +22 -26
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/nn/state.py +91 -66
- tinygrad-0.10.2/tinygrad/ops.py +1003 -0
- tinygrad-0.10.2/tinygrad/renderer/__init__.py +148 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/cstyle.py +99 -92
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/llvmir.py +83 -34
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/renderer/ptx.py +83 -99
- tinygrad-0.10.2/tinygrad/renderer/wgsl.py +95 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/comgr.py +2 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/kfd.py +4 -3
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/kgsl.py +1 -1
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/libc.py +404 -71
- tinygrad-0.10.2/tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad-0.10.2/tinygrad/runtime/autogen/pci.py +1333 -0
- tinygrad-0.10.2/tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad-0.10.2/tinygrad/runtime/autogen/webgpu.py +6985 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad-0.10.2/tinygrad/runtime/graph/hcq.py +205 -0
- tinygrad-0.10.2/tinygrad/runtime/graph/metal.py +100 -0
- tinygrad-0.10.2/tinygrad/runtime/ops_amd.py +635 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad-0.10.2/tinygrad/runtime/ops_cpu.py +24 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_cuda.py +30 -27
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_disk.py +62 -63
- tinygrad-0.10.2/tinygrad/runtime/ops_dsp.py +298 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_gpu.py +30 -30
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_hip.py +29 -31
- tinygrad-0.10.2/tinygrad/runtime/ops_llvm.py +58 -0
- tinygrad-0.10.2/tinygrad/runtime/ops_metal.py +224 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_npy.py +2 -2
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_nv.py +238 -273
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_python.py +55 -50
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad-0.10.2/tinygrad/runtime/ops_webgpu.py +225 -0
- tinygrad-0.10.2/tinygrad/runtime/support/allocator.py +94 -0
- tinygrad-0.10.2/tinygrad/runtime/support/am/amdev.py +396 -0
- tinygrad-0.10.2/tinygrad/runtime/support/am/ip.py +463 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/compiler_cuda.py +4 -2
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/elf.py +28 -4
- tinygrad-0.10.2/tinygrad/runtime/support/hcq.py +471 -0
- tinygrad-0.10.2/tinygrad/runtime/support/llvm.py +26 -0
- tinygrad-0.10.2/tinygrad/shape/__init__.py +0 -0
- tinygrad-0.10.2/tinygrad/shape/shapetracker.py +143 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/shape/view.py +104 -140
- tinygrad-0.10.2/tinygrad/spec.py +155 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/tensor.py +835 -527
- tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
- tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
- tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
- tinygrad-0.10.2/tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
- tinygrad-0.10.2/tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
- tinygrad-0.10.2/tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
- tinygrad-0.10.2/tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
- tinygrad-0.10.2/tinygrad/viz/index.html +544 -0
- tinygrad-0.10.2/tinygrad/viz/perfetto.html +178 -0
- tinygrad-0.10.2/tinygrad/viz/serve.py +205 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/PKG-INFO +36 -13
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/SOURCES.txt +31 -9
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/requires.txt +21 -6
- tinygrad-0.10.0/test/test_kernel_cache.py +0 -27
- tinygrad-0.10.0/test/test_profiler.py +0 -221
- tinygrad-0.10.0/test/test_viz.py +0 -93
- tinygrad-0.10.0/tinygrad/codegen/linearize.py +0 -95
- tinygrad-0.10.0/tinygrad/codegen/uopgraph.py +0 -506
- tinygrad-0.10.0/tinygrad/device.py +0 -221
- tinygrad-0.10.0/tinygrad/engine/lazy.py +0 -228
- tinygrad-0.10.0/tinygrad/engine/realize.py +0 -217
- tinygrad-0.10.0/tinygrad/engine/schedule.py +0 -419
- tinygrad-0.10.0/tinygrad/function.py +0 -212
- tinygrad-0.10.0/tinygrad/multi.py +0 -177
- tinygrad-0.10.0/tinygrad/ops.py +0 -1152
- tinygrad-0.10.0/tinygrad/renderer/__init__.py +0 -89
- tinygrad-0.10.0/tinygrad/runtime/graph/clang.py +0 -39
- tinygrad-0.10.0/tinygrad/runtime/graph/hcq.py +0 -200
- tinygrad-0.10.0/tinygrad/runtime/graph/metal.py +0 -103
- tinygrad-0.10.0/tinygrad/runtime/ops_amd.py +0 -471
- tinygrad-0.10.0/tinygrad/runtime/ops_clang.py +0 -35
- tinygrad-0.10.0/tinygrad/runtime/ops_dsp.py +0 -181
- tinygrad-0.10.0/tinygrad/runtime/ops_llvm.py +0 -51
- tinygrad-0.10.0/tinygrad/runtime/ops_metal.py +0 -188
- tinygrad-0.10.0/tinygrad/runtime/support/hcq.py +0 -539
- tinygrad-0.10.0/tinygrad/shape/shapetracker.py +0 -111
- {tinygrad-0.10.0 → tinygrad-0.10.2}/LICENSE +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/setup.cfg +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_compile_failures.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_conv.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_masked_st.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_method_cache.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_ocl.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_rearrange_einops.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_specific_conv.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_symbolic_jit.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_tensor_variable.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/test/test_to_numpy.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/codegen/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/engine/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/py.typed +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/adreno.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/cuda.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/hip.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/hsa.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/io_uring.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/nv_gpu.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/nvrtc.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/opencl.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/autogen/qcom_dsp.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/graph/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/__init__.py +0 -0
- {tinygrad-0.10.0/tinygrad/shape → tinygrad-0.10.2/tinygrad/runtime/support/am}/__init__.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad/runtime/support/compiler_hip.py +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/dependency_links.txt +0 -0
- {tinygrad-0.10.0 → tinygrad-0.10.2}/tinygrad.egg-info/top_level.txt +0 -0
--- tinygrad-0.10.0/PKG-INFO
+++ tinygrad-0.10.2/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: tinygrad
-Version: 0.10.0
+Version: 0.10.2
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
@@ -9,25 +9,39 @@ Classifier: License :: OSI Approved :: MIT License
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Provides-Extra: llvm
-Requires-Dist: llvmlite; extra == "llvm"
 Provides-Extra: arm
 Requires-Dist: unicorn; extra == "arm"
 Provides-Extra: triton
 Requires-Dist: triton-nightly>=2.1.0.dev20231014192330; extra == "triton"
 Provides-Extra: linting
 Requires-Dist: pylint; extra == "linting"
-Requires-Dist: mypy==1.
+Requires-Dist: mypy==1.13.0; extra == "linting"
 Requires-Dist: typing-extensions; extra == "linting"
 Requires-Dist: pre-commit; extra == "linting"
 Requires-Dist: ruff; extra == "linting"
 Requires-Dist: types-tqdm; extra == "linting"
+Provides-Extra: testing-minimal
+Requires-Dist: numpy; extra == "testing-minimal"
+Requires-Dist: torch; extra == "testing-minimal"
+Requires-Dist: pytest; extra == "testing-minimal"
+Requires-Dist: pytest-xdist; extra == "testing-minimal"
+Requires-Dist: hypothesis; extra == "testing-minimal"
+Provides-Extra: testing-unit
+Requires-Dist: numpy; extra == "testing-unit"
+Requires-Dist: torch; extra == "testing-unit"
+Requires-Dist: pytest; extra == "testing-unit"
+Requires-Dist: pytest-xdist; extra == "testing-unit"
+Requires-Dist: hypothesis; extra == "testing-unit"
+Requires-Dist: tqdm; extra == "testing-unit"
+Requires-Dist: safetensors; extra == "testing-unit"
+Requires-Dist: tabulate; extra == "testing-unit"
 Provides-Extra: testing
 Requires-Dist: numpy; extra == "testing"
 Requires-Dist: torch; extra == "testing"
-Requires-Dist: pillow; extra == "testing"
 Requires-Dist: pytest; extra == "testing"
 Requires-Dist: pytest-xdist; extra == "testing"
+Requires-Dist: hypothesis; extra == "testing"
+Requires-Dist: pillow; extra == "testing"
 Requires-Dist: onnx==1.16.0; extra == "testing"
 Requires-Dist: onnx2torch; extra == "testing"
 Requires-Dist: opencv-python; extra == "testing"
@@ -40,10 +54,10 @@ Requires-Dist: tiktoken; extra == "testing"
 Requires-Dist: blobfile; extra == "testing"
 Requires-Dist: librosa; extra == "testing"
 Requires-Dist: networkx; extra == "testing"
-Requires-Dist: hypothesis; extra == "testing"
 Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
 Requires-Dist: ggml-python; extra == "testing"
+Requires-Dist: capstone; extra == "testing"
 Provides-Extra: docs
 Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
@@ -55,6 +69,14 @@ Requires-Dist: numpy; extra == "docs"
 Provides-Extra: testing-tf
 Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
 Requires-Dist: tensorflow_addons; extra == "testing-tf"
+Dynamic: author
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: provides-extra
+Dynamic: requires-python
+Dynamic: summary

 <div align="center">

@@ -139,13 +161,14 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers
 tinygrad already supports numerous accelerators, including:

 - [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
-- [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py)
+- [x] [CPU (C Code)](tinygrad/runtime/ops_cpu.py)
 - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
 - [x] [AMD](tinygrad/runtime/ops_amd.py)
 - [x] [NV](tinygrad/runtime/ops_nv.py)
 - [x] [QCOM](tinygrad/runtime/ops_qcom.py)
+- [x] [WEBGPU](tinygrad/runtime/ops_webgpu.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -183,8 +206,8 @@ y = Tensor([[2.0,0,-2.0]], requires_grad=True)
 z = y.matmul(x).sum()
 z.backward()

-print(x.grad.
-print(y.grad.
+print(x.grad.tolist()) # dz/dx
+print(y.grad.tolist()) # dz/dy
 ```

 The same thing but in PyTorch:
@@ -196,8 +219,8 @@ y = torch.tensor([[2.0,0,-2.0]], requires_grad=True)
 z = y.matmul(x).sum()
 z.backward()

-print(x.grad.
-print(y.grad.
+print(x.grad.tolist()) # dz/dx
+print(y.grad.tolist()) # dz/dy
 ```

 ## Contributing
@@ -208,7 +231,7 @@ We'll start with what will get your PR closed with a pointer to this section:

 - No code golf! While low line count is a guiding light of this project, anything that remotely looks like code golf will be closed. The true goal is reducing complexity and increasing readability, and deleting `\n`s does nothing to help with that.
 - All docs and whitespace changes will be closed unless you are a well-known contributor. The people writing the docs should be those who know the codebase the absolute best. People who have not demonstrated that shouldn't be messing with docs. Whitespace changes are both useless *and* carry a risk of introducing bugs.
-- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with
+- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainability and readability.
 - In general, the code outside the core `tinygrad/` folder is not well tested, so unless the current code there is broken, you shouldn't be changing it.
 - If your PR looks "complex", is a big diff, or adds lots of lines, it won't be reviewed or merged. Consider breaking it up into smaller PRs that are individually clear wins. A common pattern I see is prerequisite refactors before adding new functionality. If you can (cleanly) refactor to the point that the feature is a 3 line change, this is great, and something easy for us to review.

--- tinygrad-0.10.0/README.md
+++ tinygrad-0.10.2/README.md
@@ -81,13 +81,14 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers
 tinygrad already supports numerous accelerators, including:

 - [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
-- [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py)
+- [x] [CPU (C Code)](tinygrad/runtime/ops_cpu.py)
 - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
 - [x] [AMD](tinygrad/runtime/ops_amd.py)
 - [x] [NV](tinygrad/runtime/ops_nv.py)
 - [x] [QCOM](tinygrad/runtime/ops_qcom.py)
+- [x] [WEBGPU](tinygrad/runtime/ops_webgpu.py)

 And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -125,8 +126,8 @@ y = Tensor([[2.0,0,-2.0]], requires_grad=True)
 z = y.matmul(x).sum()
 z.backward()

-print(x.grad.
-print(y.grad.
+print(x.grad.tolist()) # dz/dx
+print(y.grad.tolist()) # dz/dy
 ```

 The same thing but in PyTorch:
@@ -138,8 +139,8 @@ y = torch.tensor([[2.0,0,-2.0]], requires_grad=True)
 z = y.matmul(x).sum()
 z.backward()

-print(x.grad.
-print(y.grad.
+print(x.grad.tolist()) # dz/dx
+print(y.grad.tolist()) # dz/dy
 ```

 ## Contributing
@@ -150,7 +151,7 @@ We'll start with what will get your PR closed with a pointer to this section:

 - No code golf! While low line count is a guiding light of this project, anything that remotely looks like code golf will be closed. The true goal is reducing complexity and increasing readability, and deleting `\n`s does nothing to help with that.
 - All docs and whitespace changes will be closed unless you are a well-known contributor. The people writing the docs should be those who know the codebase the absolute best. People who have not demonstrated that shouldn't be messing with docs. Whitespace changes are both useless *and* carry a risk of introducing bugs.
-- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with
+- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainability and readability.
 - In general, the code outside the core `tinygrad/` folder is not well tested, so unless the current code there is broken, you shouldn't be changing it.
 - If your PR looks "complex", is a big diff, or adds lots of lines, it won't be reviewed or merged. Consider breaking it up into smaller PRs that are individually clear wins. A common pattern I see is prerequisite refactors before adding new functionality. If you can (cleanly) refactor to the point that the feature is a 3 line change, this is great, and something easy for us to review.

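For reference, after this change the README's autograd example prints gradients with `tolist()` instead of the old accessor. Assembled from the hunk context above (the `Tensor.eye` setup lines sit outside the hunk and are assumed unchanged), the updated snippet reads:

```python
from tinygrad import Tensor

x = Tensor.eye(3, requires_grad=True)
y = Tensor([[2.0,0,-2.0]], requires_grad=True)
z = y.matmul(x).sum()
z.backward()

print(x.grad.tolist())  # dz/dx
print(y.grad.tolist())  # dz/dy
```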
--- tinygrad-0.10.0/setup.py
+++ tinygrad-0.10.2/setup.py
@@ -7,16 +7,24 @@ directory = Path(__file__).resolve().parent
 with open(directory / 'README.md', encoding='utf-8') as f:
 long_description = f.read()

+testing_minimal = [
+"numpy",
+"torch",
+"pytest",
+"pytest-xdist",
+"hypothesis",
+]
+
 setup(name='tinygrad',
-version='0.10.0',
+version='0.10.2',
 description='You like pytorch? You like micrograd? You love tinygrad! <3',
 author='George Hotz',
 license='MIT',
 long_description=long_description,
 long_description_content_type='text/markdown',
-packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
-package_data = {'tinygrad': ['py.typed']},
+packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine', 'tinygrad.viz',
+'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.support.am', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+package_data = {'tinygrad': ['py.typed'], 'tinygrad.viz': ['index.html', 'perfetto.html', 'assets/**/*']},
 classifiers=[
 "Programming Language :: Python :: 3",
 "License :: OSI Approved :: MIT License"
@@ -24,24 +32,25 @@ setup(name='tinygrad',
 install_requires=[],
 python_requires='>=3.10',
 extras_require={
-'llvm': ["llvmlite"],
 'arm': ["unicorn"],
 'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
 'linting': [
 "pylint",
-"mypy==1.
+"mypy==1.13.0",
 "typing-extensions",
 "pre-commit",
 "ruff",
 "types-tqdm",
 ],
 #'mlperf': ["mlperf-logging @ git+https://github.com/mlperf/logging.git@4.1.0-rc3"],
-'testing': [
-"numpy",
-"torch",
+'testing_minimal': testing_minimal,
+'testing_unit': testing_minimal + [
+"tqdm",
+"safetensors",
+"tabulate" # for sz.py
+],
+'testing': testing_minimal + [
 "pillow",
-"pytest",
-"pytest-xdist",
 "onnx==1.16.0",
 "onnx2torch",
 "opencv-python",
@@ -54,10 +63,10 @@ setup(name='tinygrad',
 "blobfile",
 "librosa",
 "networkx",
-"hypothesis",
 "nibabel",
 "bottle",
-"ggml-python"
+"ggml-python",
+"capstone"
 ],
 'docs': [
 "mkdocs",
@@ -71,6 +80,6 @@ setup(name='tinygrad',
 'testing_tf': [
 "tensorflow==2.15.1",
 "tensorflow_addons",
-]
+],
 },
 include_package_data=True)
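The packaging changes above have two visible effects on an installed 0.10.2: the new extras groups show up in the wheel metadata, and the `tinygrad.viz` HTML assets (new files in the listing at the top) ship inside the package. A quick standard-library check, as a sketch; extras names are normalized to dashed form by packaging, and the path layout is taken from the file list above:

```python
from importlib.metadata import metadata
import os
import tinygrad

# expect 'testing-minimal' and 'testing-unit' alongside 'testing', 'docs', 'testing-tf', ...
print(metadata("tinygrad").get_all("Provides-Extra"))

# assets packaged by the new package_data entry for 'tinygrad.viz'
viz_dir = os.path.join(os.path.dirname(tinygrad.__file__), "viz")
print(os.path.isfile(os.path.join(viz_dir, "index.html")))     # expected True on 0.10.2
print(os.path.isfile(os.path.join(viz_dir, "perfetto.html")))  # expected True on 0.10.2
```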
--- tinygrad-0.10.0/test/test_arange.py
+++ tinygrad-0.10.2/test/test_arange.py
@@ -1,11 +1,12 @@
 import unittest, contextlib
 import numpy as np
-from tinygrad import Tensor, GlobalCounters, dtypes, nn
+from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device
 from tinygrad.helpers import CI, Context, getenv
 from tinygrad.engine.realize import run_schedule
 from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
 from tinygrad.engine.realize import CompiledRunner, ExecItem
 from tinygrad.engine.search import get_kernel_actions
+from tinygrad.ops import Ops

 class TestArange(unittest.TestCase):
 def _get_flops(self, N, opts=None):
@@ -21,7 +22,7 @@ class TestArange(unittest.TestCase):
 #print(p.src)
 ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
 np.testing.assert_equal(tt.numpy(), np.arange(N))
-return p.
+return p.estimates.ops

 def test_complexity(self, opts=None, limit=None):
 # add 1 to avoid divide by 0. arange is 0 flops now!
@@ -40,7 +41,7 @@ class TestArange(unittest.TestCase):
 def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=1)

 @unittest.skip("doesn't work yet")
-def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1,
+def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, arg=32)])

 def test_all_opts(self, opts=None, exclude=None):
 k = Kernel(Tensor.arange(256).schedule()[-1].ast)
@@ -58,11 +59,11 @@ class TestArange(unittest.TestCase):
 self.test_complexity(opts)
 def test_all_opts_w_local(self):
 with contextlib.suppress(KernelOptError):
-return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1,
+return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, arg=32)])
 def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
-def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0,
+def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])
 def test_all_opts_w_upcast_and_unroll(self):
-return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0,
+return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])

 class TestIndexing(unittest.TestCase):
 def test_arange_2_reduce(self):
@@ -71,12 +72,11 @@ class TestIndexing(unittest.TestCase):
 needle.realize()
 with Context(NOOPT=1, FUSE_ARANGE=1):
 GlobalCounters.reset()
-
-out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+out = ((Tensor.arange(1,16385)-1)*needle).sum()
 sched = out.schedule()
-
+self.assertEqual(len(sched), 1)
 run_schedule(sched)
-
+self.assertEqual(out.item(), 1337)

 @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
 def test_manual_index(self):
@@ -86,13 +86,13 @@ class TestIndexing(unittest.TestCase):
 print("*** indexing ***")
 with Context(NOOPT=1, FUSE_ARANGE=1):
 GlobalCounters.reset()
-rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int).
+rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumalu(axis=-1, op=Ops.ADD, _include_initial=True).reshape(4, 256, 16384, 1)
 idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
 reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
 full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
 X = full.sum(axis=(2,3))
 sched = X.schedule()
-
+self.assertEqual(len(sched), 1)
 run_schedule(sched)
 assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
 np.testing.assert_allclose(real_index, X.numpy())
@@ -108,7 +108,7 @@ class TestIndexing(unittest.TestCase):
 assert X.shape == (4,256)
 sched = X.schedule()
 # TODO: enable these asserts when the scheduler can handle this
-#
+#self.assertEqual(len(sched), 1)
 run_schedule(sched)
 #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
 np.testing.assert_allclose(real_index, X.numpy())
@@ -123,7 +123,7 @@ class TestIndexing(unittest.TestCase):
 X = dataset[idxs]
 assert X.shape == (4,256)
 sched = X.schedule()
-
+self.assertEqual(len(sched), 2)
 run_schedule(sched)
 assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
 np.testing.assert_allclose(real_index, X.numpy())
@@ -138,7 +138,7 @@ class TestIndexing(unittest.TestCase):
 np.testing.assert_equal(X.numpy(), 0)

 @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
-def test_index_mnist(self, noopt=1, op_limit=512*784*
+def test_index_mnist(self, noopt=1, op_limit=512*784*13):
 from tinygrad.nn.datasets import mnist
 X_train, Y_train, _, _ = mnist()
 with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
@@ -152,12 +152,13 @@ class TestIndexing(unittest.TestCase):
 @unittest.skip("not ready")
 def test_index_mnist_opt(self): self.test_index_mnist(0)

-@unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+@unittest.skipIf(getenv("PTX") or Device.DEFAULT == "WEBGPU", "broken on ptx and WebGPU for some reason")
 def test_llama_embedding(self, noopt=1, op_limit=65536):
 # llama3 is 128256
 vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
 emb = nn.Embedding(vocab_size, embed_size)
-
+# TODO: why is a new realize needed here
+emb_w = emb.weight.realize().numpy()
 x = Tensor([1,2,3,4])
 with Context(NOOPT=noopt, FUSE_ARANGE=1):
 GlobalCounters.reset()
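The reworked `test_arange_2_reduce` above now asserts that the indexed reduction schedules as a single kernel under `FUSE_ARANGE`. A standalone sketch of that pattern (the one-hot `needle` is built here purely for illustration; the test constructs it in lines outside the hunk):

```python
from tinygrad import Tensor, GlobalCounters, dtypes
from tinygrad.helpers import Context
from tinygrad.engine.realize import run_schedule

needle = (Tensor.arange(16384) == 1337).cast(dtypes.int).realize()
with Context(NOOPT=1, FUSE_ARANGE=1):
  GlobalCounters.reset()
  out = ((Tensor.arange(1, 16385) - 1) * needle).sum()
  sched = out.schedule()
  run_schedule(sched)
  print(len(sched), out.item())  # the test expects one kernel and the value 1337
```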
--- tinygrad-0.10.0/test/test_assign.py
+++ tinygrad-0.10.2/test/test_assign.py
@@ -2,7 +2,8 @@
 import unittest
 import numpy as np
 from tinygrad import dtypes, Tensor, TinyJit, GlobalCounters, Variable
-from tinygrad.
+from tinygrad.device import is_dtype_supported
+from tinygrad.helpers import temp

 N = 200 # has to be bigger than the cache to fail

@@ -168,16 +169,6 @@ class TestAssign(unittest.TestCase):
 a += 1
 np.testing.assert_allclose(a.numpy(), 3)

-# NOTE: this is similar to the resnet failure
-#@unittest.expectedFailure
-def test_double_assign_alt(self):
-a = Tensor.ones(4).contiguous().realize()
-b = Tensor([1, 2, 3, 4]).realize().lazydata
-a1 = a.lazydata.assign(b)
-a2 = a.lazydata.assign(b)
-sched = create_schedule([a1, a2])
-self.assertEqual(len(sched), 1)
-
 def test_crossover_assign(self):
 a = Tensor.full((4,), 2).contiguous().realize()
 b = Tensor.full((4,), 3).contiguous().realize()
@@ -212,6 +203,7 @@ class TestAssign(unittest.TestCase):
 np.testing.assert_equal(b0.numpy(), 128)
 np.testing.assert_equal(b1.numpy(), 608)

+@unittest.skip("TODO: bring this assert back")
 def test_crossunder_assign(self):
 # NOTE: should *not* raise AssertionError from numpy
 with self.assertRaisesRegex(RuntimeError, "cycle"):
@@ -293,6 +285,7 @@ class TestAssign(unittest.TestCase):
 #assert ba1 == ba2 and ba1 != bb1
 np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

+@unittest.skip("multi output not supported anymore")
 def test_simple_assignment_multioutput(self):
 a = Tensor.randn(32, 32).realize()
 b = Tensor.full((32, ), 1.).contiguous().realize()
@@ -331,6 +324,7 @@ class TestAssign(unittest.TestCase):
 b.assign(r + b.permute(1, 0))
 b.realize()

+@unittest.skip("multi output not supported anymore")
 def test_permuted_reduceop_multioutput_dual_use(self):
 a = Tensor.randn(32, 32, 32).realize()
 b = Tensor.full((32, 32), 1.).contiguous().realize()
@@ -343,6 +337,7 @@ class TestAssign(unittest.TestCase):
 c.assign(r + b_perm)
 Tensor.realize(b, c)

+@unittest.skip("multi output not supported anymore")
 def test_permuted_reduceop_multioutput_dual_use_possible(self):
 a = Tensor.randn(32, 32, 32, dtype=dtypes.int).realize()
 b = Tensor.arange(32 * 32).reshape(32, 32).realize()
@@ -376,6 +371,14 @@ class TestAssign(unittest.TestCase):

 # TODO: is there a way to sneak in a permute such that it returns the wrong answer?

+@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
+def test_setitem_half(self):
+a = Tensor.full((8,), 1.0, dtype=dtypes.half).contiguous().realize()
+b = Tensor.full((4,), 2.0, dtype=dtypes.half).contiguous().realize()
+assign = a[:4].assign(b)
+assign.realize()
+np.testing.assert_allclose(a.numpy(), [2., 2., 2., 2., 1., 1., 1., 1.])
+
 @unittest.skip("don't use output buffer, and mismatch dtype no longer supported")
 def test_cast_assignment(self):
 a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
@@ -387,5 +390,9 @@ class TestAssign(unittest.TestCase):
 assert oba1 is None and oba2 is None
 np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))

+def test_disk_assignment(self):
+a = Tensor.empty(5, device=f"disk:{temp('disk_assignment')}").assign(Tensor.ones(5)).numpy()
+np.testing.assert_equal(a, np.ones(5))
+
 if __name__ == "__main__":
 unittest.main()
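The new `test_setitem_half` exercises assigning into a slice of a realized buffer; the half dtype is only there to gate the test on `is_dtype_supported`. The same pattern in default float32, as a minimal sketch mirroring the test:

```python
import numpy as np
from tinygrad import Tensor

a = Tensor.full((8,), 1.0).contiguous().realize()
b = Tensor.full((4,), 2.0).contiguous().realize()
a[:4].assign(b).realize()  # writes through the sliced view into a's buffer
np.testing.assert_allclose(a.numpy(), [2., 2., 2., 2., 1., 1., 1., 1.])
```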
--- tinygrad-0.10.0/test/test_const_folding.py
+++ tinygrad-0.10.2/test/test_const_folding.py
@@ -1,16 +1,18 @@
-import unittest, math
+import unittest, itertools, math
+from typing import Any
 from tinygrad import Tensor, Device, dtypes
-from tinygrad.
-from tinygrad.
+from tinygrad.dtype import DType
+from tinygrad.ops import Ops, UOp
 from tinygrad.helpers import CI
+from tinygrad.codegen.devectorizer import full_graph_rewrite
 import numpy as np
 from tinygrad.device import is_dtype_supported

 def _check_ast_count(desired_count:int, t:Tensor):
 # NOTE: this has side effect because everything can be scheduled only once
-schedule =
+schedule = t.schedule()
 asts = [s for s in schedule if s.ast.op is Ops.SINK]
-assert len(asts) == desired_count
+assert len(asts) == desired_count, f"{len(asts)} != {desired_count}"

 class TestUnaryOpsConstFolding(unittest.TestCase):
 def test_all_consts_ops(self):
@@ -98,13 +100,47 @@ class TestBinaryOpsConstFolding(unittest.TestCase):
 def test_tensor_one_pow(self):
 _check_ast_count(0, Tensor.ones(4) ** Tensor([1.0, 2, 3, 4]))

+class TestBitcastConstFolding(unittest.TestCase):
+def test_scalar_bitcast(self):
+def t(cases: dict[DType, Any]):
+for (from_dt, from_v), (to_dt, to_v) in itertools.product(cases.items(), cases.items()):
+if not math.isnan(from_v):
+r = full_graph_rewrite(UOp.const(from_dt, from_v).bitcast(to_dt).sink()).src[0]
+self.assertEqual(r.op, Ops.CONST, msg:=f"{from_dt} -> {to_dt} ({from_v} -> {to_v})")
+self.assertEqual(r.dtype, to_dt, msg)
+np.testing.assert_equal(r.arg, to_v, msg)
+
+t({dtypes.int8: 0, dtypes.uint8: 0, dtypes.bool: False})
+t({dtypes.int8: 1, dtypes.uint8: 1, dtypes.bool: True})
+
+t({dtypes.int8: -1, dtypes.uint8: 2**8-1})
+t({dtypes.int16: -1, dtypes.uint16: 2**16-1, dtypes.float16: float('nan')})
+t({dtypes.int32: -1, dtypes.uint32: 2**32-1, dtypes.float32: float('nan')})
+t({dtypes.int64: -1, dtypes.uint64: 2**64-1, dtypes.float64: float('nan')})
+
+t({dtypes.int8: -2**7, dtypes.uint8: 2**7})
+t({dtypes.int16: -2**15, dtypes.uint16: 2**15})
+t({dtypes.int32: -2**31, dtypes.uint32: 2**31})
+t({dtypes.int64: -2**63, dtypes.uint64: 2**63})
+
+t({dtypes.int16: 13496, dtypes.uint16: 13496, dtypes.float16: 0.294921875})
+t({dtypes.int32: 1050081145, dtypes.uint32: 1050081145, dtypes.float32: 0.29485681653022766})
+t({dtypes.int64: 4598983288165178391, dtypes.uint64: 4598983288165178391, dtypes.float64: 0.29485681936461233})
+
+def test_vec_bitcast(self):
+r = full_graph_rewrite(UOp.const(dtypes.int32.vec(3), (-1, -2**31, 75)).bitcast(dtypes.uint32.vec(3)).sink()).src[0]
+self.assertEqual(r.op, Ops.VECTORIZE)
+self.assertEqual(r.dtype, dtypes.uint32.vec(3))
+self.assertEqual(tuple(x.arg for x in r.src), (2**32-1, 2**31, 75))
+
 # folds advance indexing into basic indexing
 class TestIndexingConstFolding(unittest.TestCase):
 def test_scalar_index(self):
 t = Tensor.arange(16).float().reshape(1,1,4,4).realize()
-
-_check_ast_count(
-_check_ast_count(
+# TODO: fold these
+_check_ast_count(2, t[:,:,Tensor(1),:])
+_check_ast_count(2, t[:,:,Tensor(1)+2,:])
+_check_ast_count(2, t[:,:,Tensor(1),Tensor(0)])

 @unittest.expectedFailure
 def test_const_tensor_index(self):
@@ -130,11 +166,12 @@ class TestMovedConstFolding(unittest.TestCase):

 def test_cast_padded(self):
 # NOTE: this is folded due to CAST_BEFORE_VIEW
+# update: CAST_BEFORE_VIEW=1 is no longer supported
 if is_dtype_supported(dtypes.int16):
-_check_ast_count(
+_check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16))
 np.testing.assert_equal(Tensor.ones(4).pad(((1, 1),)).cast(dtypes.int16).numpy(), [0, 1, 1, 1, 1, 0])
 if is_dtype_supported(dtypes.uint16):
-_check_ast_count(
+_check_ast_count(1, Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16))
 np.testing.assert_equal(Tensor.full(4, fill_value=-1).pad(((1, 1),)).cast(dtypes.uint16).numpy(), [0, 65535, 65535, 65535, 65535, 0])
 # not folded
 if is_dtype_supported(dtypes.int64):
@@ -158,6 +195,37 @@ class TestReduceOpsConstFolding(unittest.TestCase):
 _check_ast_count(1, Tensor.ones(4).pad(((1, 1),)).exp().sum())
 np.testing.assert_allclose(Tensor.ones(4).pad(((1, 1),)).exp().sum().numpy(), 4 * math.e + 2)

+def test_bool_zero_max(self):
+_check_ast_count(0, Tensor.full((1, 2), True).shrink(((0, 1), (0, 0))).max((1, 0)))
+np.testing.assert_equal(Tensor.full((1, 2), True).shrink(((0, 1), (0, 0))).max((1, 0)).numpy(), False)
+
+def test_zero_size_ops(self):
+for reduceop in [lambda x:x.prod(), lambda x:x.sum()]: # lambda x:x.max() NOTE: numpy gives "reduction operation maximum which has no identity"
+_check_ast_count(0, reduceop(Tensor.empty(1, 0)))
+np.testing.assert_equal(reduceop(Tensor.empty(shape:=(1, 0))).numpy(), reduceop(np.empty(shape)))
+
+def test_zero_size_ops_view(self):
+for reduceop in [lambda x:x.prod(), lambda x:x.sum()]:
+_check_ast_count(0, reduceop(Tensor.empty(1, 0, 4).permute((1, 2, 0)).contiguous()))
+np.testing.assert_equal(reduceop(Tensor.empty(shape:=(1, 0))).numpy(), reduceop(np.empty((shape))))
+
+def test_zero_size_ops_realized(self):
+for reduceop in [lambda x:x.prod(), lambda x:x.sum()]:
+_check_ast_count(0, reduceop((Tensor.randn(0, 1)+1).realize()))
+np.testing.assert_equal(reduceop((Tensor.randn(shape:=(0, 1))+1).realize()).numpy(), reduceop(np.empty(shape)))
+
+def test_zero_size_realize_folded(self):
+# non contiguous folded output doesn't realize
+_check_ast_count(0, Tensor.empty(1, 0).sum())
+# contiguous folded const can still schedule
+a = Tensor.empty(1, 0).sum().contiguous()
+_check_ast_count(2, a+2)
+self.assertIsNotNone(a.lazydata.base.realized)
+np.testing.assert_equal((Tensor.empty(1, 0).sum().contiguous()+2).numpy(), 2)
+# otherwise we just fuse it
+_check_ast_count(1, (Tensor.empty(1, 0).sum()+2).contiguous())
+np.testing.assert_equal((Tensor.empty(1, 0).sum()+2).numpy(), 2)
+
 def test_const_prod(self):
 _check_ast_count(0, Tensor.full((2, 3), fill_value=2).prod())
 np.testing.assert_equal(Tensor.full((2, 3), fill_value=2).prod().numpy(), 2**(2*3))
@@ -206,6 +274,8 @@ class TestMultiConstFolding(unittest.TestCase):
 _check_ast_count(0, t ** 1)
 _check_ast_count(0, 1 ** t)

+# failing because multi calls .contiguous() on every single sharded uop
+@unittest.expectedFailure
 def test_multi_const_folding_tensor(self):
 ds = tuple(f"{Device.DEFAULT}:{i}" for i in range(4))
 t = Tensor.arange(16).float().realize().to(ds)
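The constants in the new `TestBitcastConstFolding` table are one bit pattern read as several dtypes. A numpy-only sanity check of a few of those pairings (independent of tinygrad, shown only for orientation):

```python
import numpy as np

# the same 16/32-bit patterns reinterpreted, matching the (int, uint, float) triples above
print(np.array(13496, dtype=np.int16).view(np.float16))       # 0.2949 (0.294921875)
print(np.array(1050081145, dtype=np.int32).view(np.float32))  # 0.29485682
print(np.array(-1, dtype=np.int32).view(np.uint32))           # 4294967295 == 2**32 - 1
print(np.array(-2**31, dtype=np.int32).view(np.uint32))       # 2147483648 == 2**31
```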
--- tinygrad-0.10.0/test/test_conv_shapetracker.py
+++ tinygrad-0.10.2/test/test_conv_shapetracker.py
@@ -3,7 +3,6 @@ import unittest
 from tinygrad.ops import Ops
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d
-from tinygrad.engine.schedule import create_schedule
 from tinygrad.shape.shapetracker import ShapeTracker, View
 from tinygrad.helpers import prod
 from test.unit.test_shapetracker import shapetracker_getitem
@@ -11,13 +10,12 @@ from test.unit.test_shapetracker import shapetracker_getitem
 class TestConvShapetracker(unittest.TestCase):
 def test_conv_3x3_one_view(self):
 conv = Conv2d(16, 32, (3, 3))
-
 # first run to init the weights, they are scheduled.
-
+conv(Tensor.empty(1, 16, 10, 10)).schedule()
 # run it again to get the kernels
-sched = [si for si in
+sched = [si for si in conv(Tensor.empty(1, 16, 10, 10)).schedule() if si.ast.op is Ops.SINK]
 assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
-for st in [x.st_arg for x in sched[0].ast.
+for st in [x.st_arg for x in sched[0].ast.toposort if x.op is Ops.LOAD]:
 assert len(st.views) == 1

 def test_conv_2x2_backward_one_view(self):
@@ -26,11 +24,10 @@ class TestConvShapetracker(unittest.TestCase):
 conv(X).mean().backward()
 si = X.grad.schedule()[-1]
 print(si)
-ldb = [x for x in si.ast.
+ldb = [x for x in si.ast.toposort if x.op is Ops.LOAD][0]
 st: ShapeTracker = ldb.st_arg.simplify()
-# NOTE: st.real_size() is broken
 print(si.inputs[0].size)
-
+self.assertEqual(si.inputs[0].size, st.real_size())
 for v in st.views: print(v)

 # same st