tinygrad-0.9.0.tar.gz → tinygrad-0.9.2.tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in the public registry (PyPI). It is provided for informational purposes only.
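
For readers who want to reproduce the comparison locally, here is a minimal sketch (not part of the original diff; it assumes network access and uses PyPI's public JSON API) that fetches both sdists and lists the file-level additions and removals. Running `diff -ru` on the two extracted trees reproduces the per-file changes shown below.

```python
# Sketch only: download the two tinygrad sdists from PyPI and compare their member lists.
import io, json, tarfile, urllib.request

def sdist(version: str) -> tarfile.TarFile:
    meta = json.load(urllib.request.urlopen(f"https://pypi.org/pypi/tinygrad/{version}/json"))
    url = next(f["url"] for f in meta["urls"] if f["packagetype"] == "sdist")
    return tarfile.open(fileobj=io.BytesIO(urllib.request.urlopen(url).read()))

old, new = sdist("0.9.0"), sdist("0.9.2")
old_names = {n.split("/", 1)[-1] for n in old.getnames()}  # strip the tinygrad-x.y.z/ prefix
new_names = {n.split("/", 1)[-1] for n in new.getnames()}
print("added:", sorted(new_names - old_names))
print("removed:", sorted(old_names - new_names))
```
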
Files changed (160)
  1. {tinygrad-0.9.0 → tinygrad-0.9.2}/PKG-INFO +23 -14
  2. {tinygrad-0.9.0 → tinygrad-0.9.2}/README.md +18 -12
  3. {tinygrad-0.9.0 → tinygrad-0.9.2}/setup.py +7 -3
  4. tinygrad-0.9.2/test/test_arange.py +167 -0
  5. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_const_folding.py +8 -3
  6. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_conv.py +8 -0
  7. tinygrad-0.9.2/test/test_conv_shapetracker.py +63 -0
  8. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_custom_function.py +6 -5
  9. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_device_speed.py +2 -2
  10. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_dtype.py +94 -13
  11. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_dtype_alu.py +54 -15
  12. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_fuzz_shape_ops.py +3 -2
  13. tinygrad-0.9.2/test/test_graph.py +235 -0
  14. tinygrad-0.9.2/test/test_hcq.py +463 -0
  15. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_image_dtype.py +22 -10
  16. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_jit.py +122 -2
  17. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_lazybuffer.py +9 -9
  18. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_lazyop.py +1 -1
  19. tinygrad-0.9.2/test/test_linearizer.py +2077 -0
  20. tinygrad-0.9.2/test/test_linearizer_dumb.py +104 -0
  21. tinygrad-0.9.2/test/test_linearizer_failures.py +467 -0
  22. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_linearizer_overflows.py +3 -3
  23. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_multitensor.py +231 -95
  24. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_nn.py +147 -68
  25. tinygrad-0.9.2/test/test_ocl.py +20 -0
  26. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_ops.py +380 -105
  27. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_optim.py +2 -1
  28. tinygrad-0.9.2/test/test_pattern_matcher.py +186 -0
  29. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_pickle.py +36 -4
  30. tinygrad-0.9.2/test/test_profiler.py +220 -0
  31. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_randomness.py +12 -6
  32. tinygrad-0.9.2/test/test_renderer_failures.py +43 -0
  33. tinygrad-0.9.2/test/test_schedule.py +1589 -0
  34. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_search.py +24 -13
  35. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_speed_v_torch.py +5 -14
  36. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_subbuffer.py +2 -3
  37. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_jit.py +62 -1
  38. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_ops.py +37 -29
  39. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_symbolic_shapetracker.py +47 -1
  40. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor.py +225 -62
  41. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor_data.py +12 -1
  42. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_tensor_variable.py +23 -18
  43. tinygrad-0.9.2/test/test_transcendental.py +71 -0
  44. tinygrad-0.9.2/test/test_uop_graph.py +662 -0
  45. tinygrad-0.9.2/test/test_uops.py +379 -0
  46. tinygrad-0.9.2/test/test_uops_stats.py +203 -0
  47. tinygrad-0.9.2/test/test_verify_lazyop.py +76 -0
  48. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_winograd.py +9 -7
  49. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/codegen/kernel.py +313 -192
  50. tinygrad-0.9.2/tinygrad/codegen/lowerer.py +215 -0
  51. tinygrad-0.9.2/tinygrad/codegen/transcendental.py +310 -0
  52. tinygrad-0.9.2/tinygrad/codegen/uopgraph.py +622 -0
  53. tinygrad-0.9.2/tinygrad/codegen/uops.py +293 -0
  54. tinygrad-0.9.2/tinygrad/device.py +679 -0
  55. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/dtype.py +25 -11
  56. tinygrad-0.9.2/tinygrad/engine/__init__.py +0 -0
  57. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/engine/graph.py +24 -37
  58. tinygrad-0.9.2/tinygrad/engine/jit.py +276 -0
  59. tinygrad-0.9.2/tinygrad/engine/realize.py +268 -0
  60. tinygrad-0.9.2/tinygrad/engine/schedule.py +413 -0
  61. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/engine/search.py +33 -23
  62. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/function.py +26 -23
  63. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/helpers.py +121 -14
  64. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/lazy.py +55 -56
  65. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/multi.py +51 -42
  66. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/__init__.py +40 -23
  67. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/datasets.py +2 -1
  68. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/state.py +6 -7
  69. tinygrad-0.9.2/tinygrad/ops.py +170 -0
  70. tinygrad-0.9.2/tinygrad/renderer/__init__.py +87 -0
  71. tinygrad-0.9.2/tinygrad/renderer/assembly.py +267 -0
  72. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/renderer/cstyle.py +125 -93
  73. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/renderer/llvmir.py +44 -53
  74. tinygrad-0.9.2/tinygrad/runtime/__init__.py +0 -0
  75. tinygrad-0.9.2/tinygrad/runtime/autogen/amd_gpu.py +32858 -0
  76. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/comgr.py +36 -10
  77. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/cuda.py +6 -162
  78. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hsa.py +146 -14
  79. tinygrad-0.9.2/tinygrad/runtime/autogen/io_uring.py +1486 -0
  80. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/kfd.py +32 -0
  81. tinygrad-0.9.2/tinygrad/runtime/autogen/libc.py +4260 -0
  82. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/nv_gpu.py +269 -0
  83. tinygrad-0.9.2/tinygrad/runtime/autogen/nvrtc.py +579 -0
  84. tinygrad-0.9.2/tinygrad/runtime/graph/__init__.py +0 -0
  85. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/clang.py +5 -4
  86. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/cuda.py +9 -12
  87. tinygrad-0.9.2/tinygrad/runtime/graph/hcq.py +200 -0
  88. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/graph/metal.py +18 -15
  89. tinygrad-0.9.2/tinygrad/runtime/ops_amd.py +442 -0
  90. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_clang.py +2 -2
  91. tinygrad-0.9.2/tinygrad/runtime/ops_cuda.py +127 -0
  92. tinygrad-0.9.2/tinygrad/runtime/ops_disk.py +121 -0
  93. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_gpu.py +6 -4
  94. tinygrad-0.9.2/tinygrad/runtime/ops_hip.py +70 -0
  95. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_metal.py +43 -33
  96. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_npy.py +1 -1
  97. tinygrad-0.9.2/tinygrad/runtime/ops_nv.py +545 -0
  98. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_python.py +35 -35
  99. tinygrad-0.9.2/tinygrad/runtime/support/__init__.py +0 -0
  100. tinygrad-0.9.2/tinygrad/runtime/support/compiler_cuda.py +78 -0
  101. tinygrad-0.9.0/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.9.2/tinygrad/runtime/support/compiler_hip.py +35 -12
  102. tinygrad-0.9.2/tinygrad/runtime/support/elf.py +38 -0
  103. tinygrad-0.9.2/tinygrad/shape/__init__.py +0 -0
  104. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/shapetracker.py +10 -16
  105. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/symbolic.py +5 -11
  106. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/shape/view.py +67 -40
  107. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/tensor.py +601 -215
  108. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/PKG-INFO +23 -14
  109. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/SOURCES.txt +23 -5
  110. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/requires.txt +4 -1
  111. tinygrad-0.9.0/test/test_arange.py +0 -17
  112. tinygrad-0.9.0/test/test_conv_shapetracker.py +0 -22
  113. tinygrad-0.9.0/test/test_linearizer.py +0 -1453
  114. tinygrad-0.9.0/test/test_linearizer_failures.py +0 -248
  115. tinygrad-0.9.0/test/test_pattern_matcher.py +0 -93
  116. tinygrad-0.9.0/test/test_schedule.py +0 -859
  117. tinygrad-0.9.0/test/test_uop_graph.py +0 -82
  118. tinygrad-0.9.0/test/test_uops.py +0 -245
  119. tinygrad-0.9.0/test/test_uops_stats.py +0 -83
  120. tinygrad-0.9.0/tinygrad/codegen/linearizer.py +0 -460
  121. tinygrad-0.9.0/tinygrad/codegen/uops.py +0 -415
  122. tinygrad-0.9.0/tinygrad/device.py +0 -183
  123. tinygrad-0.9.0/tinygrad/engine/jit.py +0 -195
  124. tinygrad-0.9.0/tinygrad/engine/realize.py +0 -191
  125. tinygrad-0.9.0/tinygrad/engine/schedule.py +0 -362
  126. tinygrad-0.9.0/tinygrad/ops.py +0 -136
  127. tinygrad-0.9.0/tinygrad/renderer/__init__.py +0 -61
  128. tinygrad-0.9.0/tinygrad/renderer/assembly.py +0 -276
  129. tinygrad-0.9.0/tinygrad/runtime/autogen/amd_gpu.py +0 -1900
  130. tinygrad-0.9.0/tinygrad/runtime/driver/hsa.py +0 -143
  131. tinygrad-0.9.0/tinygrad/runtime/graph/hcq.py +0 -143
  132. tinygrad-0.9.0/tinygrad/runtime/graph/hsa.py +0 -171
  133. tinygrad-0.9.0/tinygrad/runtime/ops_amd.py +0 -564
  134. tinygrad-0.9.0/tinygrad/runtime/ops_cuda.py +0 -185
  135. tinygrad-0.9.0/tinygrad/runtime/ops_disk.py +0 -60
  136. tinygrad-0.9.0/tinygrad/runtime/ops_hsa.py +0 -278
  137. tinygrad-0.9.0/tinygrad/runtime/ops_nv.py +0 -630
  138. {tinygrad-0.9.0 → tinygrad-0.9.2}/LICENSE +0 -0
  139. {tinygrad-0.9.0 → tinygrad-0.9.2}/setup.cfg +0 -0
  140. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_assign.py +0 -0
  141. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_copy_speed.py +0 -0
  142. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_fusion_op.py +0 -0
  143. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_gc.py +0 -0
  144. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_kernel_cache.py +0 -0
  145. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_masked_st.py +0 -0
  146. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_method_cache.py +0 -0
  147. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_net_speed.py +0 -0
  148. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_sample.py +0 -0
  149. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_setitem.py +0 -0
  150. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_specific_conv.py +0 -0
  151. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_to_numpy.py +0 -0
  152. {tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_zero_copy.py +0 -0
  153. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/__init__.py +0 -0
  154. {tinygrad-0.9.0/tinygrad/engine → tinygrad-0.9.2/tinygrad/codegen}/__init__.py +0 -0
  155. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/nn/optim.py +0 -0
  156. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hip.py +0 -0
  157. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/autogen/opencl.py +0 -0
  158. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad/runtime/ops_llvm.py +0 -0
  159. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/dependency_links.txt +0 -0
  160. {tinygrad-0.9.0 → tinygrad-0.9.2}/tinygrad.egg-info/top_level.txt +0 -0

{tinygrad-0.9.0 → tinygrad-0.9.2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tinygrad
- Version: 0.9.0
+ Version: 0.9.2
  Summary: You like pytorch? You like micrograd? You love tinygrad! <3
  Author: George Hotz
  License: MIT
@@ -10,7 +10,6 @@ Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: numpy
- Requires-Dist: tqdm
  Requires-Dist: pyobjc-framework-Metal; platform_system == "Darwin"
  Requires-Dist: pyobjc-framework-libdispatch; platform_system == "Darwin"
  Provides-Extra: llvm
@@ -35,15 +34,19 @@ Requires-Dist: onnx==1.16.0; extra == "testing"
  Requires-Dist: onnx2torch; extra == "testing"
  Requires-Dist: opencv-python; extra == "testing"
  Requires-Dist: tabulate; extra == "testing"
+ Requires-Dist: tqdm; extra == "testing"
  Requires-Dist: safetensors; extra == "testing"
  Requires-Dist: transformers; extra == "testing"
  Requires-Dist: sentencepiece; extra == "testing"
  Requires-Dist: tiktoken; extra == "testing"
+ Requires-Dist: blobfile; extra == "testing"
  Requires-Dist: librosa; extra == "testing"
  Requires-Dist: networkx; extra == "testing"
  Requires-Dist: hypothesis; extra == "testing"
  Requires-Dist: nibabel; extra == "testing"
+ Requires-Dist: bottle; extra == "testing"
  Provides-Extra: docs
+ Requires-Dist: mkdocs; extra == "docs"
  Requires-Dist: mkdocs-material; extra == "docs"
  Requires-Dist: mkdocstrings[python]; extra == "docs"
  Requires-Dist: markdown-callouts; extra == "docs"
@@ -64,7 +67,7 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

  <h3>

- [Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](/docs) | [Examples](/examples) | [Showcase](/docs/showcase.md) | [Discord](https://discord.gg/ZjZadyC7PK)
+ [Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](https://docs.tinygrad.org/) | [Discord](https://discord.gg/ZjZadyC7PK)

  </h3>

@@ -106,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
  As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
  Throw in an optimizer, a data loader, and some compute, and you have all you need.

- ```py
+ ```python
  from tinygrad import Tensor, nn

  class LinearNet:
@@ -121,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

  x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

- for i in range(10):
-   optim.zero_grad()
-   loss = model(x).sparse_categorical_crossentropy(y).backward()
-   optim.step()
-   print(i, loss.item())
+ with Tensor.train():
+   for i in range(10):
+     optim.zero_grad()
+     loss = model(x).sparse_categorical_crossentropy(y).backward()
+     optim.step()
+     print(i, loss.item())
  ```

  See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -139,7 +143,8 @@ tinygrad already supports numerous accelerators, including:
  - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
  - [x] [METAL](tinygrad/runtime/ops_metal.py)
  - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
- - [x] [HSA](tinygrad/runtime/ops_hsa.py)
+ - [x] [AMD](tinygrad/runtime/ops_amd.py)
+ - [x] [NV](tinygrad/runtime/ops_nv.py)

  And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -163,11 +168,11 @@ python3 -m pip install git+https://github.com/tinygrad/tinygrad.git

  ## Documentation

- Documentation along with a quick start guide can be found in the [docs/](/docs) directory.
+ Documentation along with a quick start guide can be found on the [docs website](https://docs.tinygrad.org/) built from the [docs/](/docs) directory.

  ### Quick example comparing to PyTorch

- ```py
+ ```python
  from tinygrad import Tensor

  x = Tensor.eye(3, requires_grad=True)
@@ -180,7 +185,7 @@ print(y.grad.numpy()) # dz/dy
  ```

  The same thing but in PyTorch:
- ```py
+ ```python
  import torch

  x = torch.eye(3, requires_grad=True)
@@ -209,7 +214,7 @@ Now, what we want:
  - Bug fixes (with a regression test) are great! This library isn't 1.0 yet, so if you stumble upon a bug, fix it, write a test, and submit a PR, this is valuable work.
  - Solving bounties! tinygrad [offers cash bounties](https://docs.google.com/spreadsheets/d/1WKHbT-7KOgjEawq5h5Ic1qUWzpfAzuD_J06N1JwOCGs/edit?usp=sharing) for certain improvements to the library. All new code should be high quality and well tested.
  - Features. However, if you are adding a feature, consider the line tradeoff. If it's 3 lines, there's less of a bar of usefulness it has to meet over something that's 30 or 300 lines. All features must have regression tests. In general with no other constraints, your feature's API should match torch or numpy.
- - Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win.
+ - Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win. Refactors should pass [process replay](#process-replay-tests).
  - Tests/fuzzers. If you can add tests that are non brittle, they are welcome. We have some fuzzers in here too, and there's a plethora of bugs that can be found with them and by improving them. Finding bugs, even writing broken tests (that should pass) with `@unittest.expectedFailure` is great. This is how we make progress.
  - Dead code removal from core `tinygrad/` folder. We don't care about the code in extra, but removing dead code from the core library is great. Less for new people to read and be confused by.

@@ -225,3 +230,7 @@ python3 -m pip install -e '.[testing]' # install extra deps for testing
  python3 test/test_ops.py # just the ops tests
  python3 -m pytest test/ # whole test suite
  ```
+
+ #### Process replay tests
+
+ [Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.

{tinygrad-0.9.0 → tinygrad-0.9.2}/README.md
@@ -9,7 +9,7 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an

  <h3>

- [Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](/docs) | [Examples](/examples) | [Showcase](/docs/showcase.md) | [Discord](https://discord.gg/ZjZadyC7PK)
+ [Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](https://docs.tinygrad.org/) | [Discord](https://discord.gg/ZjZadyC7PK)

  </h3>

@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
  As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
  Throw in an optimizer, a data loader, and some compute, and you have all you need.

- ```py
+ ```python
  from tinygrad import Tensor, nn

  class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

  x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

- for i in range(10):
-   optim.zero_grad()
-   loss = model(x).sparse_categorical_crossentropy(y).backward()
-   optim.step()
-   print(i, loss.item())
+ with Tensor.train():
+   for i in range(10):
+     optim.zero_grad()
+     loss = model(x).sparse_categorical_crossentropy(y).backward()
+     optim.step()
+     print(i, loss.item())
  ```

  See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -84,7 +85,8 @@ tinygrad already supports numerous accelerators, including:
  - [x] [LLVM](tinygrad/runtime/ops_llvm.py)
  - [x] [METAL](tinygrad/runtime/ops_metal.py)
  - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
- - [x] [HSA](tinygrad/runtime/ops_hsa.py)
+ - [x] [AMD](tinygrad/runtime/ops_amd.py)
+ - [x] [NV](tinygrad/runtime/ops_nv.py)

  And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.

@@ -108,11 +110,11 @@ python3 -m pip install git+https://github.com/tinygrad/tinygrad.git

  ## Documentation

- Documentation along with a quick start guide can be found in the [docs/](/docs) directory.
+ Documentation along with a quick start guide can be found on the [docs website](https://docs.tinygrad.org/) built from the [docs/](/docs) directory.

  ### Quick example comparing to PyTorch

- ```py
+ ```python
  from tinygrad import Tensor

  x = Tensor.eye(3, requires_grad=True)
@@ -125,7 +127,7 @@ print(y.grad.numpy()) # dz/dy
  ```

  The same thing but in PyTorch:
- ```py
+ ```python
  import torch

  x = torch.eye(3, requires_grad=True)
@@ -154,7 +156,7 @@ Now, what we want:
  - Bug fixes (with a regression test) are great! This library isn't 1.0 yet, so if you stumble upon a bug, fix it, write a test, and submit a PR, this is valuable work.
  - Solving bounties! tinygrad [offers cash bounties](https://docs.google.com/spreadsheets/d/1WKHbT-7KOgjEawq5h5Ic1qUWzpfAzuD_J06N1JwOCGs/edit?usp=sharing) for certain improvements to the library. All new code should be high quality and well tested.
  - Features. However, if you are adding a feature, consider the line tradeoff. If it's 3 lines, there's less of a bar of usefulness it has to meet over something that's 30 or 300 lines. All features must have regression tests. In general with no other constraints, your feature's API should match torch or numpy.
- - Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win.
+ - Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win. Refactors should pass [process replay](#process-replay-tests).
  - Tests/fuzzers. If you can add tests that are non brittle, they are welcome. We have some fuzzers in here too, and there's a plethora of bugs that can be found with them and by improving them. Finding bugs, even writing broken tests (that should pass) with `@unittest.expectedFailure` is great. This is how we make progress.
  - Dead code removal from core `tinygrad/` folder. We don't care about the code in extra, but removing dead code from the core library is great. Less for new people to read and be confused by.

@@ -170,3 +172,7 @@ python3 -m pip install -e '.[testing]' # install extra deps for testing
  python3 test/test_ops.py # just the ops tests
  python3 -m pytest test/ # whole test suite
  ```
+
+ #### Process replay tests
+
+ [Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
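
The training-loop hunk above only shows the changed lines. For context, a self-contained sketch of what the updated 0.9.2 README snippet amounts to is given below; the `LinearNet` layer definitions are illustrative assumptions reconstructed around the context lines visible in the hunk (the `model.l1`/`model.l2` parameters, the Adam optimizer, and the `(4, 1, 28, 28)` input), not a verbatim copy of the README.

```python
# Sketch of the updated README loop: the body is unchanged, it is now wrapped in
# `with Tensor.train():` so training-mode behavior (e.g. dropout) is enabled.
from tinygrad import Tensor, nn

class LinearNet:
  def __init__(self):
    self.l1 = Tensor.kaiming_uniform(784, 128)  # layer shapes are assumptions for illustration
    self.l2 = Tensor.kaiming_uniform(128, 10)
  def __call__(self, x: Tensor) -> Tensor:
    return x.flatten(1).dot(self.l1).relu().dot(self.l2)

model = LinearNet()
optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)
x, y = Tensor.rand(4, 1, 28, 28), Tensor([2, 4, 3, 7])  # replace with a real MNIST dataloader

with Tensor.train():
  for i in range(10):
    optim.zero_grad()
    loss = model(x).sparse_categorical_crossentropy(y).backward()
    optim.step()
    print(i, loss.item())
```
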

{tinygrad-0.9.0 → tinygrad-0.9.2}/setup.py
@@ -8,19 +8,19 @@ with open(directory / 'README.md', encoding='utf-8') as f:
    long_description = f.read()

  setup(name='tinygrad',
-       version='0.9.0',
+       version='0.9.2',
        description='You like pytorch? You like micrograd? You love tinygrad! <3',
        author='George Hotz',
        license='MIT',
        long_description=long_description,
        long_description_content_type='text/markdown',
        packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                   'tinygrad.runtime', 'tinygrad.runtime.driver', 'tinygrad.runtime.graph', 'tinygrad.shape'],
+                   'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
        classifiers=[
          "Programming Language :: Python :: 3",
          "License :: OSI Approved :: MIT License"
        ],
-       install_requires=["numpy", "tqdm",
+       install_requires=["numpy",
                          "pyobjc-framework-Metal; platform_system=='Darwin'",
                          "pyobjc-framework-libdispatch; platform_system=='Darwin'"],
        python_requires='>=3.8',
@@ -46,16 +46,20 @@ setup(name='tinygrad',
            "onnx2torch",
            "opencv-python",
            "tabulate",
+           "tqdm",
            "safetensors",
            "transformers",
            "sentencepiece",
            "tiktoken",
+           "blobfile",
            "librosa",
            "networkx",
            "hypothesis",
            "nibabel",
+           "bottle",
          ],
          'docs': [
+           "mkdocs",
            "mkdocs-material",
            "mkdocstrings[python]",
            "markdown-callouts",

tinygrad-0.9.2/test/test_arange.py (new file)
@@ -0,0 +1,167 @@
+ import unittest, contextlib
+ import numpy as np
+ from tinygrad import Tensor, GlobalCounters, dtypes, nn
+ from tinygrad.helpers import CI, Context, getenv
+ from tinygrad.engine.realize import run_schedule
+ from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+ from tinygrad.engine.realize import CompiledRunner, ExecItem
+ from tinygrad.engine.search import get_kernel_actions
+
+ class TestArange(unittest.TestCase):
+   def _get_flops(self, N, opts=None):
+     GlobalCounters.reset()
+     tt = Tensor.arange(N)
+     sched = tt.schedule()
+     self.assertEqual(len(sched), 1)
+     k = Kernel(sched[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     p = k.to_program()
+     print(p.name)
+     #print(p.src)
+     ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+     np.testing.assert_equal(tt.numpy(), np.arange(N))
+     return p.op_estimate
+
+   def test_complexity(self, opts=None):
+     # add 1 to avoid divide by 0. arange is 0 flops now!
+     f1 = self._get_flops(256, opts) + 1
+     f2 = self._get_flops(2560, opts) + 1
+     print(f"{f1=}, {f2=}")
+     assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+
+   def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)])
+   def test_complexity_w_unroll(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)])
+   def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)])
+
+   @unittest.skip("doesn't work yet")
+   def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+   def test_all_opts(self, opts=None, exclude=None):
+     k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+     k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+     if opts is not None:
+       for o in opts: k.apply_opt(o)
+     all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+     all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+     for opts in all_opts:
+       if exclude is not None and opts[-1] in exclude: continue
+       print(opts)
+       self.test_complexity(opts)
+   def test_all_opts_w_local(self):
+     with contextlib.suppress(KernelOptError):
+       return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+   def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+   def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+   def test_all_opts_w_upcast_and_unroll(self):
+     return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+ class TestIndexing(unittest.TestCase):
+   def test_arange_2_reduce(self):
+     needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+     needle[1337] = 1
+     needle.realize()
+     with Context(NOOPT=1, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       # TODO: it should work without these reshapes
+       out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+       sched = out.schedule()
+       assert len(sched) == 1
+       run_schedule(sched)
+     assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_manual_index(self):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=1, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+       idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+       reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+       full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+       X = full.sum(axis=(2,3))
+       sched = X.schedule()
+       assert len(sched) == 1
+       run_schedule(sched)
+       assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+     np.testing.assert_allclose(real_index, X.numpy())
+
+   def test_index(self):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=1):
+       GlobalCounters.reset()
+       X = dataset[idxs]
+       assert X.shape == (4,256)
+       sched = X.schedule()
+       # TODO: enable these asserts when the scheduler can handle this
+       #assert len(sched) == 1, f"{len(sched)} != 1"
+       run_schedule(sched)
+       #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+     np.testing.assert_allclose(real_index, X.numpy())
+
+   def test_index_fused(self, noopt=1):
+     dataset = Tensor.rand(16384, 256).realize()
+     idxs = Tensor([0,3,5,6]).realize()
+     real_index = dataset.numpy()[idxs.numpy()]
+     print("*** indexing ***")
+     with Context(NOOPT=noopt, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       X = dataset[idxs]
+       assert X.shape == (4,256)
+       sched = X.schedule()
+       assert len(sched) == 2
+       run_schedule(sched)
+       assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+     np.testing.assert_allclose(real_index, X.numpy())
+   @unittest.skip("not ready")
+   def test_index_fused_opt(self): self.test_index_fused(0)
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_index_mnist(self, noopt=1):
+     from tinygrad.nn.datasets import mnist
+     X_train, Y_train, _, _ = mnist()
+     with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+       GlobalCounters.reset()
+       samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0])
+       x = X_train[samples].numpy()
+       y = Y_train[samples].numpy()
+       assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+     np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+     np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+   @unittest.skip("not ready")
+   def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+   @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+   def test_llama_embedding(self, noopt=1, op_limit=0):
+     # llama3 is 128256
+     vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+     emb = nn.Embedding(vocab_size, embed_size)
+     emb_w = emb.weight.numpy()
+     x = Tensor([1,2,3,4])
+     with Context(NOOPT=noopt, FUSE_ARANGE=1):
+       GlobalCounters.reset()
+       z = emb(x).realize()
+       self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+       self.assertEqual(GlobalCounters.kernel_count, 2)
+     if getenv("CHECK", 1):
+       import torch
+       with torch.no_grad():
+         torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+         torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+         torch_z = torch_emb(torch.tensor(x.numpy()))
+       # TODO: reshape to match torch, should we do this in nn?
+       np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+   # at least the arange is being fused
+   def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1736704000)
+
+ if __name__ == "__main__":
+   unittest.main()
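
The new test file above revolves around the FUSE_ARANGE context flag, which lets the scheduler fuse the arange produced by fancy indexing into the consuming reduce kernel. A small usage sketch (assuming tinygrad 0.9.2 is installed; the shapes are arbitrary) of the same counters the tests check:

```python
# Sketch: count kernels/ops for a fancy-indexing lookup with arange fusion enabled.
from tinygrad import Tensor, GlobalCounters
from tinygrad.helpers import Context

dataset = Tensor.rand(1024, 16).realize()
idxs = Tensor([0, 3, 5, 6]).realize()
with Context(FUSE_ARANGE=1):
  GlobalCounters.reset()
  X = dataset[idxs].realize()  # indexing lowers to arange + compare + reduce
print(GlobalCounters.kernel_count, GlobalCounters.global_ops)
print(X.shape)  # (4, 16)
```
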

{tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_const_folding.py
@@ -2,14 +2,14 @@ import unittest, math
  from tinygrad import Tensor, Device, dtypes
  from tinygrad.engine.schedule import create_schedule
  from tinygrad.helpers import CI
- from tinygrad.ops import BufferOps
+ from tinygrad.ops import MetaOps
  import numpy as np
  from test.helpers import is_dtype_supported

  def _check_ast_count(desired_count:int, t:Tensor):
    # NOTE: this has side effect because everything can be scheduled only once
    schedule = create_schedule(t.lazydata.lbs)
-   asts = [s for s in schedule if s.ast[0].op is BufferOps.STORE]
+   asts = [s for s in schedule if s.ast.op is MetaOps.KERNEL]
    assert len(asts) == desired_count

  class TestUnaryOpsConstFolding(unittest.TestCase):
@@ -28,6 +28,11 @@ class TestUnaryOpsConstFolding(unittest.TestCase):
      _check_ast_count(0, Tensor([1, 2, 3]).neg().mul(-1))
      _check_ast_count(0, Tensor([1, 2, 3]).neg().neg())

+   def test_neg_realized_no_fold(self):
+     x = Tensor.randn(32, 32)
+     x = x.clip(0, 1).realize()
+     _check_ast_count(1, x.neg())
+
  class TestBinaryOpsConstFolding(unittest.TestCase):
    def test_add_literal_zero(self):
      _check_ast_count(0, Tensor([1.0, 2, 3, 4]) + 0)
@@ -250,4 +255,4 @@ class TestTautologicalCompare(unittest.TestCase):
      np.testing.assert_equal((a != a).numpy(), [True, False, False])

  if __name__ == '__main__':
-   unittest.main()
+   unittest.main()

{tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_conv.py
@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):

      print(ret.numpy())

+   def test_two_binops_no_rerun_small(self):
+     Tensor.no_grad = True
+     x = Tensor.rand(1,1,32,32)
+     w = Tensor.rand(1,1,3,3)
+     out = x.conv2d(w, padding=(1,1))
+     np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+     Tensor.no_grad = False
+
    def test_two_binops_no_rerun(self):
      Tensor.no_grad = True
      x = Tensor.randn(1,12,128,256)

tinygrad-0.9.2/test/test_conv_shapetracker.py (new file)
@@ -0,0 +1,63 @@
+ #!/usr/bin/env python
+ import unittest
+ from tinygrad.tensor import Tensor
+ from tinygrad.ops import MetaOps, BufferOps
+ from tinygrad.nn import Conv2d
+ from tinygrad.engine.schedule import create_schedule
+ from tinygrad.shape.shapetracker import ShapeTracker, View
+ from tinygrad.helpers import prod
+ from test.unit.test_shapetracker import shapetracker_getitem
+
+ class TestConvShapetracker(unittest.TestCase):
+   def test_conv_3x3_one_view(self):
+     conv = Conv2d(16, 32, (3, 3))
+     seen = set()
+
+     # first run to init the weights, they are saved in seen
+     create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen)
+     # run it again to get the kernels
+     sched = [si for si in create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen) if si.ast.op is MetaOps.KERNEL]
+     assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
+     for st in [x.arg.st for x in sched[0].ast.lazyops if x.op is BufferOps.LOAD]:
+       assert len(st.views) == 1
+
+   @unittest.expectedFailure
+   def test_conv_2x2_backward_one_view(self):
+     X = Tensor.rand(1, 1, 3, 3, requires_grad=True)
+     conv = Conv2d(1, 1, (2, 2), bias=False)
+     conv(X).mean().backward()
+     si = X.grad.schedule()[-1]
+     print(si)
+     ldb = [x for x in si.ast.lazyops if x.op is BufferOps.LOAD][0]
+     st: ShapeTracker = ldb.arg.st.simplify()
+     # NOTE: st.real_size() is broken
+     print(si.inputs[0].size)
+     #self.assertEqual(si.inputs[0].size, st.real_size())
+     for v in st.views: print(v)
+
+     # same st
+     test_st = ShapeTracker((
+       View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+       View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+            mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False)))
+     #test_st = ShapeTracker((
+     #  View(shape=(2,4), strides=(1,4), offset=0, mask=None, contiguous=False),
+     #)).simplify()
+     #View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+     #View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+     #  mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False))).simplify()
+     print("*** new ***")
+     for v in test_st.views: print(v)
+     for i in range(prod(st.shape)):
+       i1, i2 = shapetracker_getitem(st, i), shapetracker_getitem(test_st, i)
+       print(i, i1, i2, si.inputs[0].size, i1==i2)
+       #self.assertEqual(i1, i2)
+
+     for stt in [st, test_st]:
+       s,va = stt.expr_idxs()
+       print(s)
+       print(va)
+     assert len(st.views) <= 2
+
+ if __name__ == '__main__':
+   unittest.main()

{tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_custom_function.py
@@ -31,7 +31,7 @@ def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(
  # NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
  # In general, it is also optional to write a backward function, just your backward pass won't work without it

- from tinygrad.ops import LoadOps, BinaryOps
+ from tinygrad.ops import MetaOps, BinaryOps, UnaryOps
  from tinygrad.lazy import LazyBuffer
  from tinygrad.tensor import Function

@@ -39,12 +39,13 @@ class ATan2(Function):
    def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
      assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
      self.a, self.b = a, b
-     return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), LoadOps.CUSTOM,
+     return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), MetaOps.CUSTOM,
                               arg={"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device], srcs=(a.contiguous(), b.contiguous()))
    def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
-     denom = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b))
-     return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \
-            grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.SUB, self.a).e(BinaryOps.DIV, denom)) if self.needs_input_grad[1] else None
+     recip = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b)).e(UnaryOps.RECIP)
+     return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.MUL, recip)) if self.needs_input_grad[0] else None, \
+            grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.ADD, self.a.e(UnaryOps.NEG)).e(BinaryOps.MUL, recip)) \
+            if self.needs_input_grad[1] else None

  # *** third, we use our lovely new mlop in some tests ***

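
For reference, the gradient identity that the reworked `ATan2.backward` above implements; the change swaps `DIV`/`SUB` for `RECIP`/`NEG`, with `recip` standing for 1/(a²+b²), but the math is unchanged:

$$
\frac{\partial}{\partial a}\,\operatorname{atan2}(a,b) = \frac{b}{a^{2}+b^{2}},
\qquad
\frac{\partial}{\partial b}\,\operatorname{atan2}(a,b) = \frac{-a}{a^{2}+b^{2}}
$$
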

{tinygrad-0.9.0 → tinygrad-0.9.2}/test/test_device_speed.py
@@ -1,13 +1,13 @@
  import unittest
  from tinygrad import Device
- from tinygrad.codegen.uops import UOpGraph
+ from tinygrad.codegen.uopgraph import UOpGraph
  from tinygrad.helpers import Timing, Profiling

  class TestDeviceSpeed(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
      cls.dev = Device[Device.DEFAULT]
-     cls.empty = Device[Device.DEFAULT].renderer.render("test", UOpGraph())
+     cls.empty = Device[Device.DEFAULT].renderer.render("test", UOpGraph([]))

    def test_empty_compile(self):
      with Timing("compiler "):