tinygrad 0.9.1.tar.gz → 0.9.2.tar.gz
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
- {tinygrad-0.9.1 → tinygrad-0.9.2}/PKG-INFO +13 -12
- {tinygrad-0.9.1 → tinygrad-0.9.2}/README.md +10 -11
- {tinygrad-0.9.1 → tinygrad-0.9.2}/setup.py +4 -2
- tinygrad-0.9.2/test/test_arange.py +167 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_const_folding.py +2 -2
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_conv.py +8 -0
- tinygrad-0.9.2/test/test_conv_shapetracker.py +63 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_custom_function.py +2 -2
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_device_speed.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype.py +40 -3
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype_alu.py +46 -9
- tinygrad-0.9.2/test/test_hcq.py +463 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_image_dtype.py +22 -10
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_jit.py +58 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_lazybuffer.py +9 -9
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_lazyop.py +1 -1
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer.py +860 -561
- tinygrad-0.9.2/test/test_linearizer_dumb.py +104 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer_failures.py +248 -36
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_linearizer_overflows.py +2 -2
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_multitensor.py +94 -28
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_nn.py +51 -8
- tinygrad-0.9.2/test/test_ocl.py +20 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_ops.py +194 -8
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_pattern_matcher.py +59 -41
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_pickle.py +25 -3
- tinygrad-0.9.2/test/test_profiler.py +220 -0
- tinygrad-0.9.2/test/test_renderer_failures.py +43 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_schedule.py +490 -57
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_search.py +23 -12
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_speed_v_torch.py +5 -14
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_subbuffer.py +1 -2
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor.py +130 -5
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor_data.py +12 -1
- tinygrad-0.9.2/test/test_transcendental.py +71 -0
- tinygrad-0.9.2/test/test_uop_graph.py +662 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_uops.py +106 -46
- tinygrad-0.9.2/test/test_uops_stats.py +203 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_verify_lazyop.py +22 -10
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_winograd.py +7 -6
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/codegen/kernel.py +248 -115
- tinygrad-0.9.2/tinygrad/codegen/lowerer.py +215 -0
- tinygrad-0.9.2/tinygrad/codegen/transcendental.py +310 -0
- tinygrad-0.9.2/tinygrad/codegen/uopgraph.py +622 -0
- tinygrad-0.9.2/tinygrad/codegen/uops.py +293 -0
- tinygrad-0.9.2/tinygrad/device.py +679 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/dtype.py +18 -4
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/graph.py +19 -32
- tinygrad-0.9.2/tinygrad/engine/jit.py +276 -0
- tinygrad-0.9.2/tinygrad/engine/realize.py +268 -0
- tinygrad-0.9.2/tinygrad/engine/schedule.py +413 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/search.py +29 -22
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/function.py +9 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/helpers.py +87 -49
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/lazy.py +34 -35
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/multi.py +41 -36
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/__init__.py +39 -22
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/state.py +3 -3
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/ops.py +63 -62
- tinygrad-0.9.2/tinygrad/renderer/__init__.py +87 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/assembly.py +104 -106
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/cstyle.py +87 -60
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/renderer/llvmir.py +21 -30
- tinygrad-0.9.2/tinygrad/runtime/autogen/amd_gpu.py +32858 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/cuda.py +6 -162
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/kfd.py +32 -0
- tinygrad-0.9.2/tinygrad/runtime/autogen/libc.py +4260 -0
- tinygrad-0.9.2/tinygrad/runtime/autogen/nvrtc.py +579 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/clang.py +2 -2
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/cuda.py +8 -11
- tinygrad-0.9.2/tinygrad/runtime/graph/hcq.py +200 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/graph/metal.py +18 -15
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_amd.py +197 -305
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_clang.py +2 -2
- tinygrad-0.9.2/tinygrad/runtime/ops_cuda.py +127 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_disk.py +3 -7
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_gpu.py +4 -2
- tinygrad-0.9.2/tinygrad/runtime/ops_hip.py +70 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_metal.py +38 -27
- tinygrad-0.9.2/tinygrad/runtime/ops_nv.py +545 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_python.py +26 -30
- tinygrad-0.9.2/tinygrad/runtime/support/compiler_cuda.py +78 -0
- tinygrad-0.9.1/tinygrad/runtime/driver/hip_comgr.py → tinygrad-0.9.2/tinygrad/runtime/support/compiler_hip.py +15 -1
- tinygrad-0.9.2/tinygrad/runtime/support/elf.py +38 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/shapetracker.py +5 -14
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/symbolic.py +4 -8
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/view.py +34 -22
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/tensor.py +399 -97
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/PKG-INFO +13 -12
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/SOURCES.txt +16 -4
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/requires.txt +2 -0
- tinygrad-0.9.1/test/test_arange.py +0 -19
- tinygrad-0.9.1/test/test_conv_shapetracker.py +0 -22
- tinygrad-0.9.1/test/test_print_tree.py +0 -66
- tinygrad-0.9.1/test/test_uop_graph.py +0 -190
- tinygrad-0.9.1/test/test_uops_stats.py +0 -81
- tinygrad-0.9.1/tinygrad/codegen/linearizer.py +0 -528
- tinygrad-0.9.1/tinygrad/codegen/uops.py +0 -451
- tinygrad-0.9.1/tinygrad/device.py +0 -320
- tinygrad-0.9.1/tinygrad/engine/jit.py +0 -198
- tinygrad-0.9.1/tinygrad/engine/realize.py +0 -192
- tinygrad-0.9.1/tinygrad/engine/schedule.py +0 -370
- tinygrad-0.9.1/tinygrad/renderer/__init__.py +0 -65
- tinygrad-0.9.1/tinygrad/runtime/autogen/amd_gpu.py +0 -13403
- tinygrad-0.9.1/tinygrad/runtime/graph/hcq.py +0 -187
- tinygrad-0.9.1/tinygrad/runtime/ops_cuda.py +0 -185
- tinygrad-0.9.1/tinygrad/runtime/ops_nv.py +0 -625
- {tinygrad-0.9.1 → tinygrad-0.9.2}/LICENSE +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/setup.cfg +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_assign.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_copy_speed.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_fusion_op.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_fuzz_shape_ops.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_gc.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_graph.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_kernel_cache.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_masked_st.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_method_cache.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_net_speed.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_optim.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_randomness.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_sample.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_setitem.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_specific_conv.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_jit.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_ops.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_symbolic_shapetracker.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_tensor_variable.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_to_numpy.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_zero_copy.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/codegen/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/engine/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/datasets.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/nn/optim.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/comgr.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hip.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/hsa.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/io_uring.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/nv_gpu.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/autogen/opencl.py +0 -0
- {tinygrad-0.9.1/tinygrad/runtime/driver → tinygrad-0.9.2/tinygrad/runtime/graph}/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_llvm.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/runtime/ops_npy.py +0 -0
- {tinygrad-0.9.1/tinygrad/runtime/graph → tinygrad-0.9.2/tinygrad/runtime/support}/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad/shape/__init__.py +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/dependency_links.txt +0 -0
- {tinygrad-0.9.1 → tinygrad-0.9.2}/tinygrad.egg-info/top_level.txt +0 -0
{tinygrad-0.9.1 → tinygrad-0.9.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tinygrad
-Version: 0.9.1
+Version: 0.9.2
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
@@ -39,12 +39,14 @@ Requires-Dist: safetensors; extra == "testing"
 Requires-Dist: transformers; extra == "testing"
 Requires-Dist: sentencepiece; extra == "testing"
 Requires-Dist: tiktoken; extra == "testing"
+Requires-Dist: blobfile; extra == "testing"
 Requires-Dist: librosa; extra == "testing"
 Requires-Dist: networkx; extra == "testing"
 Requires-Dist: hypothesis; extra == "testing"
 Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
 Provides-Extra: docs
+Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
 Requires-Dist: mkdocstrings[python]; extra == "docs"
 Requires-Dist: markdown-callouts; extra == "docs"
@@ -107,7 +109,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.

-```
+```python
 from tinygrad import Tensor, nn

 class LinearNet:
@@ -122,11 +124,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

-
-
-
-
-
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```

 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -169,7 +172,7 @@ Documentation along with a quick start guide can be found on the [docs website](

 ### Quick example comparing to PyTorch

-```
+```python
 from tinygrad import Tensor

 x = Tensor.eye(3, requires_grad=True)
@@ -182,7 +185,7 @@ print(y.grad.numpy()) # dz/dy
 ```

 The same thing but in PyTorch:
-```
+```python
 import torch

 x = torch.eye(3, requires_grad=True)
@@ -230,6 +233,4 @@ python3 -m pytest test/ # whole test suite

 #### Process replay tests

-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py)
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.

{tinygrad-0.9.1 → tinygrad-0.9.2}/README.md

@@ -51,7 +51,7 @@ And we can change `DEBUG` to `4` to see the generated code.
 As it turns out, 90% of what you need for neural networks are a decent autograd/tensor library.
 Throw in an optimizer, a data loader, and some compute, and you have all you need.

-```
+```python
 from tinygrad import Tensor, nn

 class LinearNet:
@@ -66,11 +66,12 @@ optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)

 x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader

-
-
-
-
-
+with Tensor.train():
+  for i in range(10):
+    optim.zero_grad()
+    loss = model(x).sparse_categorical_crossentropy(y).backward()
+    optim.step()
+    print(i, loss.item())
 ```

 See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
@@ -113,7 +114,7 @@ Documentation along with a quick start guide can be found on the [docs website](

 ### Quick example comparing to PyTorch

-```
+```python
 from tinygrad import Tensor

 x = Tensor.eye(3, requires_grad=True)
@@ -126,7 +127,7 @@ print(y.grad.numpy()) # dz/dy
 ```

 The same thing but in PyTorch:
-```
+```python
 import torch

 x = torch.eye(3, requires_grad=True)
@@ -174,6 +175,4 @@ python3 -m pytest test/ # whole test suite

 #### Process replay tests

-[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py)
-
-You can enable process replay by adding [run_process_replay] to your PR title. [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.
+[Process replay](https://github.com/tinygrad/tinygrad/blob/master/test/external/process_replay/process_replay.py) compares your PR's generated kernels against master. If your PR is a refactor or speedup without any expected behavior change, It should include [run_process_replay] in the PR title, [example](https://github.com/tinygrad/tinygrad/pull/4995). Note that you should keep your branch up-to-date with master.

{tinygrad-0.9.1 → tinygrad-0.9.2}/setup.py

@@ -8,14 +8,14 @@ with open(directory / 'README.md', encoding='utf-8') as f:
   long_description = f.read()

 setup(name='tinygrad',
-      version='0.9.1',
+      version='0.9.2',
       description='You like pytorch? You like micrograd? You love tinygrad! <3',
       author='George Hotz',
       license='MIT',
       long_description=long_description,
       long_description_content_type='text/markdown',
       packages = ['tinygrad', 'tinygrad.runtime.autogen', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.renderer', 'tinygrad.engine',
-                  'tinygrad.runtime', 'tinygrad.runtime.
+                  'tinygrad.runtime', 'tinygrad.runtime.support', 'tinygrad.runtime.graph', 'tinygrad.shape'],
       classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License"
@@ -51,6 +51,7 @@ setup(name='tinygrad',
         "transformers",
         "sentencepiece",
         "tiktoken",
+        "blobfile",
         "librosa",
         "networkx",
         "hypothesis",
@@ -58,6 +59,7 @@ setup(name='tinygrad',
         "bottle",
       ],
       'docs': [
+        "mkdocs",
         "mkdocs-material",
         "mkdocstrings[python]",
         "markdown-callouts",

tinygrad-0.9.2/test/test_arange.py (new file)

@@ -0,0 +1,167 @@
+import unittest, contextlib
+import numpy as np
+from tinygrad import Tensor, GlobalCounters, dtypes, nn
+from tinygrad.helpers import CI, Context, getenv
+from tinygrad.engine.realize import run_schedule
+from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
+from tinygrad.engine.realize import CompiledRunner, ExecItem
+from tinygrad.engine.search import get_kernel_actions
+
+class TestArange(unittest.TestCase):
+  def _get_flops(self, N, opts=None):
+    GlobalCounters.reset()
+    tt = Tensor.arange(N)
+    sched = tt.schedule()
+    self.assertEqual(len(sched), 1)
+    k = Kernel(sched[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    p = k.to_program()
+    print(p.name)
+    #print(p.src)
+    ExecItem(CompiledRunner(p), [tt.lazydata.buffer]).run()
+    np.testing.assert_equal(tt.numpy(), np.arange(N))
+    return p.op_estimate
+
+  def test_complexity(self, opts=None):
+    # add 1 to avoid divide by 0. arange is 0 flops now!
+    f1 = self._get_flops(256, opts) + 1
+    f2 = self._get_flops(2560, opts) + 1
+    print(f"{f1=}, {f2=}")
+    assert (f1 < 5000 and f2 < 5000) or (f2 / f1 < 15), f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
+
+  def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)])
+  def test_complexity_w_unroll(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)])
+  def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)])
+
+  @unittest.skip("doesn't work yet")
+  def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(op=OptOps.PADTO, axis=1, amt=32)])
+
+  def test_all_opts(self, opts=None, exclude=None):
+    k = Kernel(Tensor.arange(256).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    k = Kernel(Tensor.arange(2560).schedule()[-1].ast)
+    if opts is not None:
+      for o in opts: k.apply_opt(o)
+    all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
+    all_opts = [x for x in all_opts_256 if x in all_opts_2560]
+    for opts in all_opts:
+      if exclude is not None and opts[-1] in exclude: continue
+      print(opts)
+      self.test_complexity(opts)
+  def test_all_opts_w_local(self):
+    with contextlib.suppress(KernelOptError):
+      return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, amt=32)])
+  def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
+  def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+  def test_all_opts_w_upcast_and_unroll(self):
+    return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, amt=0)])
+
+class TestIndexing(unittest.TestCase):
+  def test_arange_2_reduce(self):
+    needle = Tensor.zeros(16384, dtype=dtypes.int).contiguous()
+    needle[1337] = 1
+    needle.realize()
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      # TODO: it should work without these reshapes
+      out = ((Tensor.arange(1,16385).reshape(16384,1)-1)*needle.reshape(16384,1)).sum()
+      sched = out.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+    assert out.item() == 1337, f"expected 1337, got {out.item()}"
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_manual_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      rng = Tensor.ones(4, 256, 16384, dtype=dtypes.int)._cumsum(axis=-1, _first_zero=True).reshape(4, 256, 16384, 1)
+      idxs = idxs.reshape(4,1,1,1).expand(4, 256, 16384, 1)
+      reshape_dataset = dataset.T.reshape(1, 256, 16384, 1).expand(4, 256, 16384, 1)
+      full = (rng==idxs).where(reshape_dataset, Tensor.zeros(4, 256, 16384, 1))
+      X = full.sum(axis=(2,3))
+      sched = X.schedule()
+      assert len(sched) == 1
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index(self):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      # TODO: enable these asserts when the scheduler can handle this
+      #assert len(sched) == 1, f"{len(sched)} != 1"
+      run_schedule(sched)
+      #assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops}"
+    np.testing.assert_allclose(real_index, X.numpy())
+
+  def test_index_fused(self, noopt=1):
+    dataset = Tensor.rand(16384, 256).realize()
+    idxs = Tensor([0,3,5,6]).realize()
+    real_index = dataset.numpy()[idxs.numpy()]
+    print("*** indexing ***")
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      X = dataset[idxs]
+      assert X.shape == (4,256)
+      sched = X.schedule()
+      assert len(sched) == 2
+      run_schedule(sched)
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(real_index, X.numpy())
+  @unittest.skip("not ready")
+  def test_index_fused_opt(self): self.test_index_fused(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_index_mnist(self, noopt=1):
+    from tinygrad.nn.datasets import mnist
+    X_train, Y_train, _, _ = mnist()
+    with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
+      GlobalCounters.reset()
+      samples = Tensor.randint(getenv("BS", 512), high=X_train.shape[0])
+      x = X_train[samples].numpy()
+      y = Y_train[samples].numpy()
+      assert GlobalCounters.global_ops < 4*16384, f"too many ops {GlobalCounters.global_ops} != {4*16384}"
+    np.testing.assert_allclose(X_train.numpy()[samples.numpy()], x)
+    np.testing.assert_allclose(Y_train.numpy()[samples.numpy()], y)
+  @unittest.skip("not ready")
+  def test_index_mnist_opt(self): self.test_index_mnist(0)
+
+  @unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
+  def test_llama_embedding(self, noopt=1, op_limit=0):
+    # llama3 is 128256
+    vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+    emb = nn.Embedding(vocab_size, embed_size)
+    emb_w = emb.weight.numpy()
+    x = Tensor([1,2,3,4])
+    with Context(NOOPT=noopt, FUSE_ARANGE=1):
+      GlobalCounters.reset()
+      z = emb(x).realize()
+      self.assertLessEqual(GlobalCounters.global_ops, op_limit)
+      self.assertEqual(GlobalCounters.kernel_count, 2)
+    if getenv("CHECK", 1):
+      import torch
+      with torch.no_grad():
+        torch_emb = torch.nn.Embedding(vocab_size, embed_size).eval()
+        torch_emb.weight[:] = torch.tensor(emb_w, dtype=torch.float32)
+      torch_z = torch_emb(torch.tensor(x.numpy()))
+      # TODO: reshape to match torch, should we do this in nn?
+      np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
+  # at least the arange is being fused
+  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1736704000)
+
+if __name__ == "__main__":
+  unittest.main()

{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_const_folding.py

@@ -2,14 +2,14 @@ import unittest, math
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.helpers import CI
-from tinygrad.ops import
+from tinygrad.ops import MetaOps
 import numpy as np
 from test.helpers import is_dtype_supported

 def _check_ast_count(desired_count:int, t:Tensor):
   # NOTE: this has side effect because everything can be scheduled only once
   schedule = create_schedule(t.lazydata.lbs)
-  asts = [s for s in schedule if s.ast
+  asts = [s for s in schedule if s.ast.op is MetaOps.KERNEL]
   assert len(asts) == desired_count

 class TestUnaryOpsConstFolding(unittest.TestCase):

{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_conv.py

@@ -42,6 +42,14 @@ class TestConv(unittest.TestCase):

     print(ret.numpy())

+  def test_two_binops_no_rerun_small(self):
+    Tensor.no_grad = True
+    x = Tensor.rand(1,1,32,32)
+    w = Tensor.rand(1,1,3,3)
+    out = x.conv2d(w, padding=(1,1))
+    np.testing.assert_allclose(out.relu().numpy(), np.maximum(out.numpy(), 0))
+    Tensor.no_grad = False
+
   def test_two_binops_no_rerun(self):
     Tensor.no_grad = True
     x = Tensor.randn(1,12,128,256)

tinygrad-0.9.2/test/test_conv_shapetracker.py (new file)

@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+import unittest
+from tinygrad.tensor import Tensor
+from tinygrad.ops import MetaOps, BufferOps
+from tinygrad.nn import Conv2d
+from tinygrad.engine.schedule import create_schedule
+from tinygrad.shape.shapetracker import ShapeTracker, View
+from tinygrad.helpers import prod
+from test.unit.test_shapetracker import shapetracker_getitem
+
+class TestConvShapetracker(unittest.TestCase):
+  def test_conv_3x3_one_view(self):
+    conv = Conv2d(16, 32, (3, 3))
+    seen = set()
+
+    # first run to init the weights, they are saved in seen
+    create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen)
+    # run it again to get the kernels
+    sched = [si for si in create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata], seen) if si.ast.op is MetaOps.KERNEL]
+    assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}"
+    for st in [x.arg.st for x in sched[0].ast.lazyops if x.op is BufferOps.LOAD]:
+      assert len(st.views) == 1
+
+  @unittest.expectedFailure
+  def test_conv_2x2_backward_one_view(self):
+    X = Tensor.rand(1, 1, 3, 3, requires_grad=True)
+    conv = Conv2d(1, 1, (2, 2), bias=False)
+    conv(X).mean().backward()
+    si = X.grad.schedule()[-1]
+    print(si)
+    ldb = [x for x in si.ast.lazyops if x.op is BufferOps.LOAD][0]
+    st: ShapeTracker = ldb.arg.st.simplify()
+    # NOTE: st.real_size() is broken
+    print(si.inputs[0].size)
+    #self.assertEqual(si.inputs[0].size, st.real_size())
+    for v in st.views: print(v)
+
+    # same st
+    test_st = ShapeTracker((
+      View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+      View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+           mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False)))
+    #test_st = ShapeTracker((
+    # View(shape=(2,4), strides=(1,4), offset=0, mask=None, contiguous=False),
+    #)).simplify()
+    #View(shape=(1, 1, 2, 4, 2, 4), strides=(0, 0, 2, 8, 1, 4), offset=0, mask=((0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2)), contiguous=False),
+    #View(shape=(1, 1, 1, 1, 3, 3, 3, 3), strides=(0, 0, 0, 0, 24, 8, 3, 1), offset=0,
+    # mask=((0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 3), (0, 2), (0, 3)), contiguous=False))).simplify()
+    print("*** new ***")
+    for v in test_st.views: print(v)
+    for i in range(prod(st.shape)):
+      i1, i2 = shapetracker_getitem(st, i), shapetracker_getitem(test_st, i)
+      print(i, i1, i2, si.inputs[0].size, i1==i2)
+      #self.assertEqual(i1, i2)
+
+    for stt in [st, test_st]:
+      s,va = stt.expr_idxs()
+      print(s)
+      print(va)
+    assert len(st.views) <= 2
+
+if __name__ == '__main__':
+  unittest.main()

{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_custom_function.py

@@ -31,7 +31,7 @@ def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(
 # NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
 # In general, it is also optional to write a backward function, just your backward pass won't work without it

-from tinygrad.ops import
+from tinygrad.ops import MetaOps, BinaryOps, UnaryOps
 from tinygrad.lazy import LazyBuffer
 from tinygrad.tensor import Function

@@ -39,7 +39,7 @@ class ATan2(Function):
   def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
     assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
     self.a, self.b = a, b
-    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype),
+    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), max(a.dtype, b.dtype), MetaOps.CUSTOM,
                              arg={"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device], srcs=(a.contiguous(), b.contiguous()))
   def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
     recip = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b)).e(UnaryOps.RECIP)

{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype.py

@@ -202,7 +202,7 @@ class TestFloatDType(TestDType):

 class TestDoubleDType(TestDType):
   DTYPE = dtypes.double
-  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX"), "conversion not supported on
+  @unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX"), "conversion not supported on CI CUDA and PTX") # TODO: why not?
   def test_float64_increased_precision(self):
     for func in [
       lambda t: t.exp(),
@@ -267,7 +267,10 @@ class TestInt32DType(TestDType): DTYPE = dtypes.int32
 class TestUint32DType(TestDType): DTYPE = dtypes.uint32

 class TestInt64DType(TestDType): DTYPE = dtypes.int64
-class TestUint64DType(TestDType):
+class TestUint64DType(TestDType):
+  DTYPE = dtypes.uint64
+  def test_uint64_load(self):
+    assert Tensor(2**64 - 1, dtype=dtypes.uint64).numpy() == 2**64 - 1

 class TestBoolDType(TestDType): DTYPE = dtypes.bool

@@ -298,7 +301,7 @@ class TestEqStrDType(unittest.TestCase):
   def test_strs(self):
     if PtrDType is None: raise unittest.SkipTest("no PtrDType support")
     self.assertEqual(str(dtypes.imagef((1,2,4))), "dtypes.imagef((1, 2, 4))")
-    self.assertEqual(str(PtrDType(dtypes.float32)), "
+    self.assertEqual(str(PtrDType(dtypes.float32)), "PtrDType(dtypes.float)")

 class TestHelpers(unittest.TestCase):
   signed_ints = (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64)
@@ -347,6 +350,20 @@ class TestHelpers(unittest.TestCase):
     with self.assertRaises(RuntimeError): dtypes.from_py({})
     with self.assertRaises(RuntimeError): dtypes.from_py(set())

+  def test_dtype_range(self):
+    for dt in core_dtypes:
+      if dtypes.is_float(dt):
+        np.testing.assert_equal(dtypes.min(dt), -math.inf)
+        np.testing.assert_equal(dtypes.max(dt), math.inf)
+      elif dtypes.is_int(dt):
+        info = np.iinfo(_to_np_dtype(dt))
+        np.testing.assert_equal(dtypes.min(dt), info.min)
+        np.testing.assert_equal(dtypes.max(dt), info.max)
+      else:
+        assert dt == dtypes.bool, dt
+        np.testing.assert_equal(dtypes.min(dt), False)
+        np.testing.assert_equal(dtypes.max(dt), True)
+
 class TestTypeSpec(unittest.TestCase):
   def setUp(self):
     self.old_default_int, self.old_default_float = dtypes.default_int, dtypes.default_float
@@ -378,6 +395,23 @@ class TestTypeSpec(unittest.TestCase):
       subprocess.run(['DEFAULT_FLOAT=TYPO python3 -c "from tinygrad import dtypes"'],
                      shell=True, check=True)

+  def test_dtype_str_arg(self):
+    n = np.random.normal(0, 1, (10, 10)).astype(np.float32)
+    tested = 0
+    for dtype_str, dtype in [
+      ("bool", dtypes.bool), ("int8", dtypes.int8), ("int", dtypes.int), ("uint32", dtypes.uint32), ("float32", dtypes.float32)]:
+      np.testing.assert_equal(Tensor(n, dtype=dtype_str).numpy(), Tensor(n, dtype=dtype).numpy())
+      np.testing.assert_equal(Tensor(n).cast(dtype_str).numpy(), Tensor(n).cast(dtype).numpy())
+      if dtype.itemsize == 4:
+        np.testing.assert_equal(Tensor(n).bitcast(dtype_str).numpy(), Tensor(n).bitcast(dtype).numpy())
+        tested += 1
+    assert tested == 3
+
+    with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="nonexistdtype")
+    with self.assertRaises(AttributeError): Tensor([1, 2, 3], dtype="")
+
+    np.testing.assert_equal(Tensor(n).sum(acc_dtype="int16").numpy(), Tensor(n).sum(acc_dtype=dtypes.int16).numpy())
+
   @given(strat.sampled_from(dtype_ints), strat.sampled_from(dtype_floats))
   def test_creation(self, default_int, default_float):
     dtypes.default_int, dtypes.default_float = default_int, default_float
@@ -439,6 +473,9 @@ class TestTypeSpec(unittest.TestCase):
     _assert_eq(Tensor.arange(5, dtype=dtypes.float16), dtypes.float16, np.arange(5))
     _assert_eq(Tensor.arange(3, 9, 0.7), dtypes.default_float, np.arange(3, 9, 0.7))
     _assert_eq(Tensor.arange(3, 8.5, 3), dtypes.default_float, np.arange(3, 8.5, 3))
+    # stop-start and step have different signs
+    _assert_eq(Tensor.arange(3, 5, -2), dtypes.default_int, np.arange(3, 5, -2))
+    _assert_eq(Tensor.arange(5.0, 3.0), dtypes.default_float, np.arange(5.0, 3.0))

   @given(strat.sampled_from(core_dtypes), strat.sampled_from([operator.gt, operator.ge, operator.le, operator.lt, operator.eq, operator.ne]))
   def test_bool_ops(self, dtype, op):

{tinygrad-0.9.1 → tinygrad-0.9.2}/test/test_dtype_alu.py

@@ -16,7 +16,7 @@ settings.register_profile("my_profile", max_examples=200, deadline=None, derando
 settings.load_profile("my_profile")
 print(settings.default)

-dtypes_float = (dtypes.float32, dtypes.
+dtypes_float = (dtypes.float16, dtypes.float32, dtypes.float64)
 dtypes_int = (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
 dtypes_bool = (dtypes.bool,)
 binary_operations = [operator.add, operator.sub, operator.mul, operator.lt, operator.eq]
@@ -24,9 +24,9 @@ binary_operations = [operator.add, operator.sub, operator.mul, operator.lt, oper
 # TODO: LLVM comparing with nan is incorrect
 if Device.DEFAULT == "LLVM":
   binary_operations.remove(operator.lt)
-  binary_operations.remove(operator.eq)

-integer_binary_operations = binary_operations + [(Tensor.xor, np.bitwise_xor)
+integer_binary_operations = binary_operations + [(Tensor.xor, np.bitwise_xor), (Tensor.bitwise_and, np.bitwise_and),
+                                                 (Tensor.bitwise_or, np.bitwise_or)]
 unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), operator.neg, (Tensor.sin, np.sin),
                     (Tensor.sqrt, np.sqrt), (Tensor.reciprocal, np.reciprocal)]

@@ -39,9 +39,8 @@ unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), operator.neg, (T
 # TODO: (a+b)/2 in tensor.py's maximum can overflow. This requires a new implementation of maximum that can be backpropagated
 #binary_operations += [(Tensor.maximum, np.maximum)]

-# TODO:
-
-if getenv("CUDACPU") or (getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "METAL": unary_operations.remove((Tensor.sin, np.sin))
+# TODO: CI CUDA segfaults on sin
+if getenv("MOCKGPU") and Device.DEFAULT == "NV": unary_operations.remove((Tensor.sin, np.sin))

 class ht:
   float64 = strat.floats(width=64, allow_subnormal=False)
@@ -68,7 +67,7 @@ def universal_test_unary(a, dtype, op):
   if not isinstance(op, tuple): op = (op, op)
   out: Tensor = op[0](Tensor([a], dtype=dtype))
   sched = create_schedule([out.lazydata])
-  ast = sched[-1].ast
+  ast = sched[-1].ast
   run_schedule(sched)
   tensor_value = out.numpy()
   numpy_value = op[1](np.array([a]).astype(_to_np_dtype(dtype)))
@@ -145,8 +144,8 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.int32, ht.int32, ht.float32, strat.sampled_from(integer_binary_operations), strat.sampled_from(binary_operations))
   def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)

-  # Metal and
-  skip_overflow = CI and
+  # Metal and CUDA and HIP behave differently than numpy in CI for overflows
+  skip_overflow = CI and Device.DEFAULT in {"AMD", "NV"}
   @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
@@ -161,5 +160,43 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.int32, strat.sampled_from(dtypes_float+dtypes_int+dtypes_bool))
   def test_int32_cast(self, a, dtype): universal_test_cast(a, dtypes.int32, dtype)

+class TestFromFuzzer(unittest.TestCase):
+  @given(strat.sampled_from(dtypes_float))
+  def test_sin(self, dtype):
+    if not is_dtype_supported(dtype): return
+    if dtype == dtypes.float64:
+      # crashes in CI CUDA
+      if getenv("MOCKGPU") and Device.DEFAULT == "NV": return
+    def _test_value(n: float, unit: float=1.0):
+      next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
+      ulp = next_float - 1.0
+      ulp = unit * ulp
+      np.testing.assert_allclose(Tensor([n], dtype=dtype).sin().numpy(), np.sin(np.array([n], dtype=_to_np_dtype(dtype))), atol=ulp, rtol=1e-5)
+    _test_value(-35.0)
+    _test_value(-25.0)
+    _test_value(25.0)
+    _test_value(30.0) # 30.0 == switch_over
+    _test_value(35.0)
+    _test_value(0.0)
+    _test_value(np.pi / 2)
+    # worst case of ulp 1.5
+    _test_value(np.pi * 2, unit=1.5)
+  @given(strat.sampled_from(dtypes_float))
+  def test_log2(self, dtype):
+    if not is_dtype_supported(dtype): return
+    if dtype == dtypes.float64:
+      # crashes in CI CUDA
+      if getenv("MOCKGPU") and Device.DEFAULT == "NV": return
+    def _test_value(n: float, unit: float=1.0):
+      next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
+      ulp = next_float - 1.0
+      ulp = unit * ulp
+      np.testing.assert_allclose(Tensor([n], dtype=dtype).log2().numpy(), np.log2(np.array([n], dtype=_to_np_dtype(dtype))), atol=ulp, rtol=1e-5)
+    fmin = np.finfo(_to_np_dtype(dtype)).tiny
+    for scale in [1.0, 1e10, 1e20, 1e30]:
+      _test_value(fmin * scale)
+      _test_value(-fmin * scale)
+    _test_value(0)
+
 if __name__ == '__main__':
   unittest.main()