tinygrad 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/__init__.py +6 -0
- tinygrad/codegen/kernel.py +572 -83
- tinygrad/codegen/linearizer.py +415 -395
- tinygrad/codegen/uops.py +415 -0
- tinygrad/device.py +183 -0
- tinygrad/dtype.py +113 -0
- tinygrad/engine/__init__.py +0 -0
- tinygrad/engine/graph.py +100 -0
- tinygrad/engine/jit.py +195 -0
- tinygrad/engine/realize.py +191 -0
- tinygrad/engine/schedule.py +362 -0
- tinygrad/engine/search.py +196 -0
- tinygrad/{mlops.py → function.py} +76 -55
- tinygrad/helpers.py +196 -89
- tinygrad/lazy.py +210 -371
- tinygrad/multi.py +169 -0
- tinygrad/nn/__init__.py +202 -22
- tinygrad/nn/datasets.py +7 -0
- tinygrad/nn/optim.py +112 -32
- tinygrad/nn/state.py +136 -39
- tinygrad/ops.py +119 -202
- tinygrad/renderer/__init__.py +61 -0
- tinygrad/renderer/assembly.py +276 -0
- tinygrad/renderer/cstyle.py +353 -166
- tinygrad/renderer/llvmir.py +150 -138
- tinygrad/runtime/autogen/amd_gpu.py +1900 -0
- tinygrad/runtime/autogen/comgr.py +865 -0
- tinygrad/runtime/autogen/cuda.py +5923 -0
- tinygrad/runtime/autogen/hip.py +5909 -0
- tinygrad/runtime/autogen/hsa.py +5761 -0
- tinygrad/runtime/autogen/kfd.py +812 -0
- tinygrad/runtime/autogen/nv_gpu.py +33328 -0
- tinygrad/runtime/autogen/opencl.py +1795 -0
- tinygrad/runtime/driver/hip_comgr.py +47 -0
- tinygrad/runtime/driver/hsa.py +143 -0
- tinygrad/runtime/graph/clang.py +38 -0
- tinygrad/runtime/graph/cuda.py +81 -0
- tinygrad/runtime/graph/hcq.py +143 -0
- tinygrad/runtime/graph/hsa.py +171 -0
- tinygrad/runtime/graph/metal.py +75 -0
- tinygrad/runtime/ops_amd.py +564 -0
- tinygrad/runtime/ops_clang.py +24 -77
- tinygrad/runtime/ops_cuda.py +175 -89
- tinygrad/runtime/ops_disk.py +56 -33
- tinygrad/runtime/ops_gpu.py +92 -95
- tinygrad/runtime/ops_hsa.py +278 -0
- tinygrad/runtime/ops_llvm.py +39 -60
- tinygrad/runtime/ops_metal.py +92 -74
- tinygrad/runtime/ops_npy.py +9 -0
- tinygrad/runtime/ops_nv.py +630 -0
- tinygrad/runtime/ops_python.py +204 -0
- tinygrad/shape/shapetracker.py +86 -254
- tinygrad/shape/symbolic.py +166 -141
- tinygrad/shape/view.py +296 -0
- tinygrad/tensor.py +2619 -448
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/LICENSE +1 -1
- tinygrad-0.9.0.dist-info/METADATA +227 -0
- tinygrad-0.9.0.dist-info/RECORD +60 -0
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/WHEEL +1 -1
- tinygrad/codegen/assembly.py +0 -190
- tinygrad/codegen/optimizer.py +0 -379
- tinygrad/codegen/search.py +0 -72
- tinygrad/graph.py +0 -83
- tinygrad/jit.py +0 -57
- tinygrad/nn/image.py +0 -100
- tinygrad/renderer/assembly_arm64.py +0 -169
- tinygrad/renderer/assembly_ptx.py +0 -98
- tinygrad/renderer/wgsl.py +0 -53
- tinygrad/runtime/lib.py +0 -113
- tinygrad/runtime/ops_cpu.py +0 -51
- tinygrad/runtime/ops_hip.py +0 -82
- tinygrad/runtime/ops_shm.py +0 -29
- tinygrad/runtime/ops_torch.py +0 -30
- tinygrad/runtime/ops_webgpu.py +0 -45
- tinygrad-0.7.0.dist-info/METADATA +0 -212
- tinygrad-0.7.0.dist-info/RECORD +0 -40
- {tinygrad-0.7.0.dist-info → tinygrad-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright (c)
|
1
|
+
Copyright (c) 2024, the tiny corp
|
2
2
|
|
3
3
|
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4
4
|
|
@@ -0,0 +1,227 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: tinygrad
|
3
|
+
Version: 0.9.0
|
4
|
+
Summary: You like pytorch? You like micrograd? You love tinygrad! <3
|
5
|
+
Author: George Hotz
|
6
|
+
License: MIT
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
9
|
+
Requires-Python: >=3.8
|
10
|
+
Description-Content-Type: text/markdown
|
11
|
+
License-File: LICENSE
|
12
|
+
Requires-Dist: numpy
|
13
|
+
Requires-Dist: tqdm
|
14
|
+
Requires-Dist: pyobjc-framework-Metal ; platform_system == "Darwin"
|
15
|
+
Requires-Dist: pyobjc-framework-libdispatch ; platform_system == "Darwin"
|
16
|
+
Provides-Extra: arm
|
17
|
+
Requires-Dist: unicorn ; extra == 'arm'
|
18
|
+
Provides-Extra: docs
|
19
|
+
Requires-Dist: mkdocs-material ; extra == 'docs'
|
20
|
+
Requires-Dist: mkdocstrings[python] ; extra == 'docs'
|
21
|
+
Requires-Dist: markdown-callouts ; extra == 'docs'
|
22
|
+
Requires-Dist: markdown-exec[ansi] ; extra == 'docs'
|
23
|
+
Requires-Dist: black ; extra == 'docs'
|
24
|
+
Provides-Extra: linting
|
25
|
+
Requires-Dist: pylint ; extra == 'linting'
|
26
|
+
Requires-Dist: mypy ; extra == 'linting'
|
27
|
+
Requires-Dist: typing-extensions ; extra == 'linting'
|
28
|
+
Requires-Dist: pre-commit ; extra == 'linting'
|
29
|
+
Requires-Dist: ruff ; extra == 'linting'
|
30
|
+
Requires-Dist: types-tqdm ; extra == 'linting'
|
31
|
+
Provides-Extra: llvm
|
32
|
+
Requires-Dist: llvmlite ; extra == 'llvm'
|
33
|
+
Provides-Extra: testing
|
34
|
+
Requires-Dist: torch ; extra == 'testing'
|
35
|
+
Requires-Dist: pillow ; extra == 'testing'
|
36
|
+
Requires-Dist: pytest ; extra == 'testing'
|
37
|
+
Requires-Dist: pytest-xdist ; extra == 'testing'
|
38
|
+
Requires-Dist: onnx ==1.16.0 ; extra == 'testing'
|
39
|
+
Requires-Dist: onnx2torch ; extra == 'testing'
|
40
|
+
Requires-Dist: opencv-python ; extra == 'testing'
|
41
|
+
Requires-Dist: tabulate ; extra == 'testing'
|
42
|
+
Requires-Dist: safetensors ; extra == 'testing'
|
43
|
+
Requires-Dist: transformers ; extra == 'testing'
|
44
|
+
Requires-Dist: sentencepiece ; extra == 'testing'
|
45
|
+
Requires-Dist: tiktoken ; extra == 'testing'
|
46
|
+
Requires-Dist: librosa ; extra == 'testing'
|
47
|
+
Requires-Dist: networkx ; extra == 'testing'
|
48
|
+
Requires-Dist: hypothesis ; extra == 'testing'
|
49
|
+
Requires-Dist: nibabel ; extra == 'testing'
|
50
|
+
Provides-Extra: testing_tf
|
51
|
+
Requires-Dist: tensorflow ==2.15.1 ; extra == 'testing_tf'
|
52
|
+
Requires-Dist: tensorflow-addons ; extra == 'testing_tf'
|
53
|
+
Provides-Extra: triton
|
54
|
+
Requires-Dist: triton-nightly >=2.1.0.dev20231014192330 ; extra == 'triton'
|
55
|
+
|
56
|
+
<div align="center">
|
57
|
+
|
58
|
+
<picture>
|
59
|
+
<source media="(prefers-color-scheme: light)" srcset="/docs/logo_tiny_light.svg">
|
60
|
+
<img alt="tiny corp logo" src="/docs/logo_tiny_dark.svg" width="50%" height="50%">
|
61
|
+
</picture>
|
62
|
+
|
63
|
+
tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) and [karpathy/micrograd](https://github.com/karpathy/micrograd). Maintained by [tiny corp](https://tinygrad.org).
|
64
|
+
|
65
|
+
<h3>
|
66
|
+
|
67
|
+
[Homepage](https://github.com/tinygrad/tinygrad) | [Documentation](/docs) | [Examples](/examples) | [Showcase](/docs/showcase.md) | [Discord](https://discord.gg/ZjZadyC7PK)
|
68
|
+
|
69
|
+
</h3>
|
70
|
+
|
71
|
+
[GitHub stars](https://github.com/tinygrad/tinygrad/stargazers)
|
72
|
+
[Unit Tests](https://github.com/tinygrad/tinygrad/actions/workflows/test.yml)
|
73
|
+
[Discord](https://discord.gg/ZjZadyC7PK)
|
74
|
+
|
75
|
+
</div>
|
76
|
+
|
77
|
+
---
|
78
|
+
|
79
|
+
This may not be the best deep learning framework, but it is a deep learning framework.
|
80
|
+
|
81
|
+
Due to its extreme simplicity, it aims to be the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.
|
82
|
+
|
83
|
+
tinygrad is still alpha software, but we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.
|
84
|
+
|
85
|
+
## Features
|
86
|
+
|
87
|
+
### LLaMA and Stable Diffusion
|
88
|
+
|
89
|
+
tinygrad can run [LLaMA](/docs/showcase.md#llama) and [Stable Diffusion](/docs/showcase.md#stable-diffusion)!
|
90
|
+
|
91
|
+
### Laziness
|
92
|
+
|
93
|
+
Try a matmul. See how, despite the style, it is fused into one kernel with the power of laziness.
|
94
|
+
|
95
|
+
```sh
|
96
|
+
DEBUG=3 python3 -c "from tinygrad import Tensor;
|
97
|
+
N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N);
|
98
|
+
c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2);
|
99
|
+
print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
|
100
|
+
```
|
101
|
+
|
102
|
+
And we can change `DEBUG` to `4` to see the generated code.
|
103
|
+
|
104
|
+
### Neural networks
|
105
|
+
|
106
|
+
As it turns out, 90% of what you need for neural networks is a decent autograd/tensor library.
|
107
|
+
Throw in an optimizer, a data loader, and some compute, and you have all you need.
|
108
|
+
|
109
|
+
```py
|
110
|
+
from tinygrad import Tensor, nn
|
111
|
+
|
112
|
+
class LinearNet:
|
113
|
+
def __init__(self):
|
114
|
+
self.l1 = Tensor.kaiming_uniform(784, 128)
|
115
|
+
self.l2 = Tensor.kaiming_uniform(128, 10)
|
116
|
+
def __call__(self, x:Tensor) -> Tensor:
|
117
|
+
return x.flatten(1).dot(self.l1).relu().dot(self.l2)
|
118
|
+
|
119
|
+
model = LinearNet()
|
120
|
+
optim = nn.optim.Adam([model.l1, model.l2], lr=0.001)
|
121
|
+
|
122
|
+
x, y = Tensor.rand(4, 1, 28, 28), Tensor([2,4,3,7]) # replace with real mnist dataloader
|
123
|
+
|
124
|
+
for i in range(10):
|
125
|
+
optim.zero_grad()
|
126
|
+
loss = model(x).sparse_categorical_crossentropy(y).backward()
|
127
|
+
optim.step()
|
128
|
+
print(i, loss.item())
|
129
|
+
```
|
130
|
+
|
131
|
+
See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full version that gets 98% in ~5 seconds
|
132
|
+
|
133
|
+
## Accelerators
|
134
|
+
|
135
|
+
tinygrad already supports numerous accelerators, including:
|
136
|
+
|
137
|
+
- [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
|
138
|
+
- [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py)
|
139
|
+
- [x] [LLVM](tinygrad/runtime/ops_llvm.py)
|
140
|
+
- [x] [METAL](tinygrad/runtime/ops_metal.py)
|
141
|
+
- [x] [CUDA](tinygrad/runtime/ops_cuda.py)
|
142
|
+
- [x] [HSA](tinygrad/runtime/ops_hsa.py)
|
143
|
+
|
144
|
+
And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.
|
145
|
+
|
146
|
+
## Installation
|
147
|
+
|
148
|
+
The current recommended way to install tinygrad is from source.
|
149
|
+
|
150
|
+
### From source
|
151
|
+
|
152
|
+
```sh
|
153
|
+
git clone https://github.com/tinygrad/tinygrad.git
|
154
|
+
cd tinygrad
|
155
|
+
python3 -m pip install -e .
|
156
|
+
```
|
157
|
+
|
158
|
+
### Direct (master)
|
159
|
+
|
160
|
+
```sh
|
161
|
+
python3 -m pip install git+https://github.com/tinygrad/tinygrad.git
|
162
|
+
```
|
163
|
+
|
164
|
+
## Documentation
|
165
|
+
|
166
|
+
Documentation along with a quick start guide can be found in the [docs/](/docs) directory.
|
167
|
+
|
168
|
+
### Quick example comparing to PyTorch
|
169
|
+
|
170
|
+
```py
|
171
|
+
from tinygrad import Tensor
|
172
|
+
|
173
|
+
x = Tensor.eye(3, requires_grad=True)
|
174
|
+
y = Tensor([[2.0,0,-2.0]], requires_grad=True)
|
175
|
+
z = y.matmul(x).sum()
|
176
|
+
z.backward()
|
177
|
+
|
178
|
+
print(x.grad.numpy()) # dz/dx
|
179
|
+
print(y.grad.numpy()) # dz/dy
|
180
|
+
```
|
181
|
+
|
182
|
+
The same thing but in PyTorch:
|
183
|
+
```py
|
184
|
+
import torch
|
185
|
+
|
186
|
+
x = torch.eye(3, requires_grad=True)
|
187
|
+
y = torch.tensor([[2.0,0,-2.0]], requires_grad=True)
|
188
|
+
z = y.matmul(x).sum()
|
189
|
+
z.backward()
|
190
|
+
|
191
|
+
print(x.grad.numpy()) # dz/dx
|
192
|
+
print(y.grad.numpy()) # dz/dy
|
193
|
+
```
|
194
|
+
|
195
|
+
## Contributing
|
196
|
+
|
197
|
+
There has been a lot of interest in tinygrad lately. Following these guidelines will help your PR get accepted.
|
198
|
+
|
199
|
+
We'll start with what will get your PR closed with a pointer to this section:
|
200
|
+
|
201
|
+
- No code golf! While low line count is a guiding light of this project, anything that remotely looks like code golf will be closed. The true goal is reducing complexity and increasing readability, and deleting `\n`s does nothing to help with that.
|
202
|
+
- All docs and whitespace changes will be closed unless you are a well-known contributor. The people writing the docs should be those who know the codebase the absolute best. People who have not demonstrated that shouldn't be messing with docs. Whitespace changes are both useless *and* carry a risk of introducing bugs.
|
203
|
+
- Anything you claim is a "speedup" must be benchmarked. In general, the goal is simplicity, so even if your PR makes things marginally faster, you have to consider the tradeoff with maintainability and readability.
|
204
|
+
- In general, the code outside the core `tinygrad/` folder is not well tested, so unless the current code there is broken, you shouldn't be changing it.
|
205
|
+
- If your PR looks "complex", is a big diff, or adds lots of lines, it won't be reviewed or merged. Consider breaking it up into smaller PRs that are individually clear wins. A common pattern I see is prerequisite refactors before adding new functionality. If you can (cleanly) refactor to the point that the feature is a 3 line change, this is great, and something easy for us to review.
|
206
|
+
|
207
|
+
Now, what we want:
|
208
|
+
|
209
|
+
- Bug fixes (with a regression test) are great! This library isn't 1.0 yet, so if you stumble upon a bug, fix it, write a test, and submit a PR, this is valuable work.
|
210
|
+
- Solving bounties! tinygrad [offers cash bounties](https://docs.google.com/spreadsheets/d/1WKHbT-7KOgjEawq5h5Ic1qUWzpfAzuD_J06N1JwOCGs/edit?usp=sharing) for certain improvements to the library. All new code should be high quality and well tested.
|
211
|
+
- Features. However, if you are adding a feature, consider the line tradeoff. If it's 3 lines, there's less of a bar of usefulness it has to meet over something that's 30 or 300 lines. All features must have regression tests. In general with no other constraints, your feature's API should match torch or numpy.
|
212
|
+
- Refactors that are clear wins. In general, if your refactor isn't a clear win it will be closed. But some refactors are amazing! Think about readability in a deep core sense. A whitespace change or moving a few functions around is useless, but if you realize that two 100 line functions can actually use the same 110 line function with arguments while also improving readability, this is a big win.
|
213
|
+
- Tests/fuzzers. If you can add tests that are non-brittle, they are welcome. We have some fuzzers in here too, and there's a plethora of bugs that can be found with them and by improving them. Finding bugs, even writing broken tests (that should pass) with `@unittest.expectedFailure` is great. This is how we make progress.
|
214
|
+
- Dead code removal from core `tinygrad/` folder. We don't care about the code in extra, but removing dead code from the core library is great. Less for new people to read and be confused by.
|
215
|
+
|
216
|
+
### Running tests
|
217
|
+
|
218
|
+
You should install the pre-commit hooks with `pre-commit install`. This will run the linter, mypy, and a subset of the tests on every commit.
|
219
|
+
|
220
|
+
For more examples on how to run the full test suite please refer to the [CI workflow](.github/workflows/test.yml).
|
221
|
+
|
222
|
+
Some examples of running tests locally:
|
223
|
+
```sh
|
224
|
+
python3 -m pip install -e '.[testing]' # install extra deps for testing
|
225
|
+
python3 test/test_ops.py # just the ops tests
|
226
|
+
python3 -m pytest test/ # whole test suite
|
227
|
+
```
|
@@ -0,0 +1,60 @@
|
|
1
|
+
tinygrad/__init__.py,sha256=jC-35zswLSXLuRRThG_o6yar6qQjLCqmeaFCj_XKN08,449
|
2
|
+
tinygrad/device.py,sha256=zXcrFjBsiV1rW0aXupszDjD98TWLHin7u8pBd5fdJqo,10446
|
3
|
+
tinygrad/dtype.py,sha256=xg2BlFIPcQw0onHW_0ktGXjved9SXgQcLNrqe6gCXto,6221
|
4
|
+
tinygrad/function.py,sha256=0xkWst2tRsOeN6YcQS65MfVfWwKQYFkAacgkTys0VdQ,9616
|
5
|
+
tinygrad/helpers.py,sha256=XI8MIeBE35wQ4q0NEsUCvkj3QdY0adI80SfCbOySOVI,12773
|
6
|
+
tinygrad/lazy.py,sha256=xqaEqXaIpt_77SP_2U6Pyfw8YeGd_0PzNDXJOOnRJ24,13379
|
7
|
+
tinygrad/multi.py,sha256=gyGXYVviaPfzAkoJjLFUiusVd3no6HRJunOOxD0DaaY,11362
|
8
|
+
tinygrad/ops.py,sha256=aNk1jLuJl--Z_u8DE8Du1iN1AFhMH-6Le5mnvRHLDvI,7124
|
9
|
+
tinygrad/tensor.py,sha256=nznRGHH7-64bMpFeu8gvdSDfJ5CEEoezIcHSxPgeJ7k,129223
|
10
|
+
tinygrad/codegen/kernel.py,sha256=RRRmOX3iOOgu5ISABW_UTVh5vfGdLFfK1UOtKtpghuY,38169
|
11
|
+
tinygrad/codegen/linearizer.py,sha256=jxwEcxxcpWOvYlIgXmlGkgNYZ4sDykiY_5Ve3a8tpYg,27622
|
12
|
+
tinygrad/codegen/uops.py,sha256=yKS-3w9teuS_3BLnHAN4vWtSRvZHmsx194YRBXMOhFI,21872
|
13
|
+
tinygrad/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
tinygrad/engine/graph.py,sha256=eEbb17qbJ0A-2VjN4l7SCbA4yI7jh6YN6EybPShmcbg,5221
|
15
|
+
tinygrad/engine/jit.py,sha256=TrdZQEXnF-SowCAyy69iEp85NPFFtOVaIz_2i1cwCvQ,11049
|
16
|
+
tinygrad/engine/realize.py,sha256=H2CgiLWRqTQSr9udJb1iLX7DFdLcPLwxSJZ2bUfHXDs,11077
|
17
|
+
tinygrad/engine/schedule.py,sha256=I3OxiNwveWbVuhhFWdR1B5G8goEyqXOHWe7UQQ3Ogz8,18487
|
18
|
+
tinygrad/engine/search.py,sha256=M11qHlufIffBS9b7mjk8gniyoGRaengwXnMfUQTHmyw,11289
|
19
|
+
tinygrad/nn/__init__.py,sha256=DoHrq9pUFs1vm9FUA5eet5_tvhlZJlC_uC4kBqo98kI,12932
|
20
|
+
tinygrad/nn/datasets.py,sha256=Mvf_0eCEEqtB9-8iiyDFhm-B7rTyWXwmSY_3xjeHleo,458
|
21
|
+
tinygrad/nn/optim.py,sha256=zf85kwumpS17fk1NBjUfe7tOUE7-XH37SL-LjgAfva8,6799
|
22
|
+
tinygrad/nn/state.py,sha256=nGR05s3kuDNp9lliCIr4-6Ek7Korha7jCAWie5S2rB4,10138
|
23
|
+
tinygrad/renderer/__init__.py,sha256=-LjQ9tC2rI8fveaS_xn24X_knXKILFj-iZFcRTk8fNM,2672
|
24
|
+
tinygrad/renderer/assembly.py,sha256=MD-SSC7-Nqwt3zrwe0aDXVX08W9Ox6Vj_byPS1k1bAQ,17923
|
25
|
+
tinygrad/renderer/cstyle.py,sha256=tFWWW-egorLFEDwX6fA9-rYxvNLc67LjxlZ6JzrWCF0,24043
|
26
|
+
tinygrad/renderer/llvmir.py,sha256=BZViWXj2G6JtEcOgc-CtnIj-d9xP0ZjgNXdTKQT_PJ8,10315
|
27
|
+
tinygrad/runtime/ops_amd.py,sha256=3jOrFqxk8JkPX043tEUFLvyKSgX7Fls785g_gOkdzVM,31811
|
28
|
+
tinygrad/runtime/ops_clang.py,sha256=XWqwobReRdu-Tj-chbWEJFMx6AQfgdGCcpdWcLWUTOQ,1468
|
29
|
+
tinygrad/runtime/ops_cuda.py,sha256=cgeoVpY9bOGU22Eh78XR5YOYY2cgsJt4Vnxl6u8N6co,10840
|
30
|
+
tinygrad/runtime/ops_disk.py,sha256=75-iihZxkhNvA5O3VaW61LOXwmlSX4XwegpnV1C4D5A,2738
|
31
|
+
tinygrad/runtime/ops_gpu.py,sha256=FB3Fp-VVEDGEt_6CfJxsM_TWzhp5giXCP1TSSRMXE80,7532
|
32
|
+
tinygrad/runtime/ops_hsa.py,sha256=YNQLqZjJ9twTJRKS41l2oIrncOAu3wdOdBegs9zYlgo,16188
|
33
|
+
tinygrad/runtime/ops_llvm.py,sha256=dODiyVSlPofHyDIZrD-V74h8W1d94VPnr_-A4gNbSO4,2229
|
34
|
+
tinygrad/runtime/ops_metal.py,sha256=fGSNpwmYIHaif9a5SiwyMX2bub-r5hTNpnrqlaPMeUc,5815
|
35
|
+
tinygrad/runtime/ops_npy.py,sha256=qaAi0AEo6nt7iZ-eWqM8z2aQfNJgZUpmBCEDmrIzWL0,369
|
36
|
+
tinygrad/runtime/ops_nv.py,sha256=PCMAHMrW4J7esgnkpwq3bB91Q3h6hBATr8JuykR9vcA,37633
|
37
|
+
tinygrad/runtime/ops_python.py,sha256=mmsDj1hJ3BtreAq5dfCuuUGbgoIhCKlVwNqMDmXBISs,10871
|
38
|
+
tinygrad/runtime/autogen/amd_gpu.py,sha256=1NDH0ualiZ8OtgTjaYcQ1HjKs_SQ7eUHuJvdrDodvCk,65022
|
39
|
+
tinygrad/runtime/autogen/comgr.py,sha256=Z99Y6K8D_nuMpOs0qDWiA0MV-RxueV65o2OyPFdcsHE,38563
|
40
|
+
tinygrad/runtime/autogen/cuda.py,sha256=GgRl4AfU54JG0G1XJj2dq2FbrUZ8XG_AnFrPAZJpSSg,250380
|
41
|
+
tinygrad/runtime/autogen/hip.py,sha256=1yUHDCwL3KkD15if2Q1Ud3GbJiR7DxsNorKZTCINw54,245532
|
42
|
+
tinygrad/runtime/autogen/hsa.py,sha256=tGpnXUhhQkAIEr0yyCxRImzajmt-nN0KzJn4KnT_bH8,270073
|
43
|
+
tinygrad/runtime/autogen/kfd.py,sha256=dDmLFL6HL_QXRW2rZOCArY55PRuXuLN9563XCipV2jM,29935
|
44
|
+
tinygrad/runtime/autogen/nv_gpu.py,sha256=K9WwwdIitHrY2AXpYy8bbdD9aEwdbz9vL7748pz6Re0,1672024
|
45
|
+
tinygrad/runtime/autogen/opencl.py,sha256=aW-luGFF5PXFEZ6SgrGANhA9qpkI-fZyEsmDfpez2Ss,82654
|
46
|
+
tinygrad/runtime/driver/hip_comgr.py,sha256=rFQRsOYo4XquwcHFTe2mGzMfozdL9hitO3DRYBDFSuM,3376
|
47
|
+
tinygrad/runtime/driver/hsa.py,sha256=PoNy8gHBPoRUhUZligFp0z_Le9fyEXbJrnlgwInt_R0,7218
|
48
|
+
tinygrad/runtime/graph/clang.py,sha256=10Bs64J0r12g6upqCHVoK3LoTrdbBBHQD43efMhlBjo,1957
|
49
|
+
tinygrad/runtime/graph/cuda.py,sha256=LNx6RQLcQSKMlHfVK5r_efujN0lRPhKqi8yp249OAIs,5265
|
50
|
+
tinygrad/runtime/graph/hcq.py,sha256=mspwzroBTwNNHDob7oK-JCt48mhuIhX_G0qNYvFVuVM,8089
|
51
|
+
tinygrad/runtime/graph/hsa.py,sha256=UJgSg2irrKT87LBZ3DfaGmoK7rJk8OZhIHEHhtF8rUE,10035
|
52
|
+
tinygrad/runtime/graph/metal.py,sha256=bwB6uAsqjEbwv5ML5ziWduBtmTpseJboo6J9ssVa4v4,4579
|
53
|
+
tinygrad/shape/shapetracker.py,sha256=hWqh2uWsbBp3lKlRpY8Fj1oTWvEx1YwVsKl0QiA-QnU,6334
|
54
|
+
tinygrad/shape/symbolic.py,sha256=hn2khLoHAJSwyZ91i679oJZCLTaz0Sf2dUG-HRJMtVw,16688
|
55
|
+
tinygrad/shape/view.py,sha256=KMf_KzNwXmcX1NbFPq862-Jv_E6TgeO27lcPjrAweF4,17092
|
56
|
+
tinygrad-0.9.0.dist-info/LICENSE,sha256=ABRhUPEILzINYIukgazD-_rPipkUNUwslrb0RxnV6Xc,1058
|
57
|
+
tinygrad-0.9.0.dist-info/METADATA,sha256=oyGO3WSmMQ7NTAK3RGk0ZXCkr-L3XKltKYhYrKEuifk,10227
|
58
|
+
tinygrad-0.9.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
59
|
+
tinygrad-0.9.0.dist-info/top_level.txt,sha256=vDABMCWBFQnx2kn9Azueu88FP-1klQdePoHikQhHymc,9
|
60
|
+
tinygrad-0.9.0.dist-info/RECORD,,
|
tinygrad/codegen/assembly.py
DELETED
@@ -1,190 +0,0 @@
|
|
1
|
-
from typing import Tuple, List, NamedTuple, Any, Dict, Optional, Union, DefaultDict, cast
|
2
|
-
from tinygrad.codegen.linearizer import UOps, Token, ConstOp, MemOp, UOp
|
3
|
-
from tinygrad.ops import BinaryOps, UnaryOps
|
4
|
-
from tinygrad.helpers import DType, dtypes, DEBUG
|
5
|
-
from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, LtNode, SumNode, AndNode
|
6
|
-
import functools
|
7
|
-
import math
|
8
|
-
from collections import defaultdict
|
9
|
-
|
10
|
-
_type_to_letter = {dtypes.float32: 'f', dtypes.bool: 'p', dtypes.int32: 'i', dtypes.int64: 'a', dtypes.uint32: 'u', dtypes.uint64: 'b', dtypes._float4: 'x', dtypes.uint8: 'uc', dtypes.float16: 'h',
|
11
|
-
dtypes.int8: 'c', dtypes.uint16: 'us', dtypes.float64: 'd'}
|
12
|
-
|
13
|
-
class Register(NamedTuple):
|
14
|
-
nm:str
|
15
|
-
dtype:DType
|
16
|
-
scalar:bool
|
17
|
-
off:Optional[int] = None
|
18
|
-
def __repr__(self): return self.nm if self.off is None else f"{self.nm}:{self.off}"
|
19
|
-
def subregs(self):
|
20
|
-
if self.dtype == dtypes._float4:
|
21
|
-
return [Register(self.nm, dtypes.float, False, off=off) for off in range(4)]
|
22
|
-
return []
|
23
|
-
|
24
|
-
class AssemblyInstruction(NamedTuple):
|
25
|
-
op: UOps
|
26
|
-
out: Optional[Register]
|
27
|
-
vin: List[Union[Register, int, float]]
|
28
|
-
arg: Any = None
|
29
|
-
|
30
|
-
# warp size of 32, s registers are shared across the warp, v are 32-wide vectors
|
31
|
-
class AssemblyLanguage:
|
32
|
-
supports_load3: bool = False
|
33
|
-
sin_is_sin2pi: bool = False
|
34
|
-
no_div: bool = False
|
35
|
-
#TODO: these should be global vars
|
36
|
-
cnts:DefaultDict[Tuple[DType, bool], int] = defaultdict(int)
|
37
|
-
tor: Dict[Any, Register] = {}
|
38
|
-
ins: List[AssemblyInstruction] = []
|
39
|
-
|
40
|
-
def type_to_letter(self,x): return _type_to_letter[x[0]].upper() if x[1] else _type_to_letter[x[0]]
|
41
|
-
def newreg(self, tok, dtype=dtypes.float32, scalar=False):
|
42
|
-
if isinstance(tok, Token): dtype = tok.dtype # this
|
43
|
-
self.tor[tok] = ret = Register(f"%{self.type_to_letter((dtype, scalar))}{self.cnts[(dtype, scalar)]}", dtype, scalar)
|
44
|
-
if dtype == dtypes._float4:
|
45
|
-
for off in range(4):
|
46
|
-
self.tor[Token(tok.name, tok.dtype, off)] = Register(ret.nm, dtypes.float, ret.scalar, off)
|
47
|
-
self.cnts[(dtype, scalar)] += 1
|
48
|
-
return ret
|
49
|
-
|
50
|
-
def render_numnode(self, b):
|
51
|
-
key = ("num", b)
|
52
|
-
if key not in self.tor: self.ins.append(AssemblyInstruction(UOps.LOAD, self.newreg(key, scalar=True, dtype=dtypes.int32), [], b))
|
53
|
-
return self.tor[key]
|
54
|
-
|
55
|
-
def render_alu(self, op, a:Register, b:Union[Register, int, float], dtype=dtypes.int32) -> Register:
|
56
|
-
key = (op, a, b)
|
57
|
-
if key not in self.tor:
|
58
|
-
#if not isinstance(b, Register): b = render_numnode(b)
|
59
|
-
self.ins.append(AssemblyInstruction(UOps.ALU, self.newreg(key, dtype=dtype, scalar=a.scalar and (not isinstance(b, Register) or b.scalar)), [a, b], op))
|
60
|
-
return self.tor[key]
|
61
|
-
|
62
|
-
def render_cast(self, a:Register, new_dtype:DType) -> Register:
|
63
|
-
if a.dtype == new_dtype: return a
|
64
|
-
key = (a, new_dtype)
|
65
|
-
if key not in self.tor:
|
66
|
-
self.ins.append(AssemblyInstruction(UOps.CAST, self.newreg(key, dtype=new_dtype), [a]))
|
67
|
-
return self.tor[key]
|
68
|
-
|
69
|
-
render_ops: Any = { Variable: lambda self, ops, ctx: ctx.tor[self], NumNode: lambda self, ops, ctx: ctx.render_numnode(self.b),
|
70
|
-
MulNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.MUL, self.a.render(ops, ctx), self.b),
|
71
|
-
DivNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.DIV, self.a.render(ops, ctx), self.b),
|
72
|
-
ModNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.MOD, self.a.render(ops, ctx), self.b),
|
73
|
-
LtNode: lambda self, ops, ctx: ctx.render_alu(BinaryOps.CMPLT, self.a.render(ops, ctx), self.b, dtype=dtypes.bool),
|
74
|
-
SumNode: lambda self,ops,ctx: functools.reduce(lambda a,b: ctx.render_alu(BinaryOps.ADD, a, b.render(ops,ctx)), self.nodes[1:], self.nodes[0].render(ops,ctx)),
|
75
|
-
AndNode: lambda self,ops,ctx: functools.reduce(lambda a,b: ctx.render_alu(BinaryOps.MUL, a, b.render(ops,ctx), dtype=dtypes.bool), self.nodes[1:], self.nodes[0].render(ops,ctx)) }
|
76
|
-
|
77
|
-
def addr_w_offset(self, args):
|
78
|
-
assert isinstance(args, MemOp)
|
79
|
-
idx = args.idx*args.memory_dtype.itemsize
|
80
|
-
off = 0 # TODO: should this be None?
|
81
|
-
if isinstance(idx, SumNode):
|
82
|
-
nums = [n.b for n in idx.nodes if isinstance(n, NumNode)]
|
83
|
-
if nums and nums[0] < 4096 and (idx-nums[0]).min >= 0: # TODO: different for each GPU?
|
84
|
-
idx -= nums[0]
|
85
|
-
off = cast(int, nums[0])
|
86
|
-
reg = idx.render(self.render_ops, self)
|
87
|
-
if self.supports_load3:
|
88
|
-
if reg.scalar:
|
89
|
-
new_reg = self.newreg((reg.nm, 'vec'), dtype=reg.dtype)
|
90
|
-
self.ins.append(AssemblyInstruction(UOps.ALU, new_reg, [reg], UnaryOps.NOOP))
|
91
|
-
reg = new_reg
|
92
|
-
return self.tor[args.name], reg, off
|
93
|
-
reg = self.render_alu(BinaryOps.ADD, self.render_cast(reg, dtypes.uint64), self.tor[args.name], dtype=dtypes.uint64)
|
94
|
-
return reg, None, off
|
95
|
-
|
96
|
-
def uops_to_asmstyle(lang, function_name:str, uops:List[UOp]):
|
97
|
-
#TODO: Do not use clear()
|
98
|
-
lang.ins.clear()
|
99
|
-
lang.tor.clear()
|
100
|
-
lang.cnts.clear()
|
101
|
-
buf_to_dtype = {args[0]:args[1] for uop,_,_,args in uops if uop == UOps.DEFINE_GLOBAL}
|
102
|
-
global_size, local_size = [], []
|
103
|
-
skipload_branch = 0
|
104
|
-
lang.ins += [AssemblyInstruction(UOps.SPECIAL, lang.newreg(buf, dtype=dtypes.uint64, scalar=True), [], buf) for buf in buf_to_dtype]
|
105
|
-
for uop,newvar,vin,args in uops:
|
106
|
-
if uop == UOps.DEFINE_LOCAL:
|
107
|
-
lang.ins.append(AssemblyInstruction(UOps.DEFINE_LOCAL, None, [], args))
|
108
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, lang.newreg(args[0], dtype=dtypes.uint64), [args[0]], UnaryOps.NOOP))
|
109
|
-
elif uop == UOps.LOOP:
|
110
|
-
if args[1] == "global":
|
111
|
-
for i,var in enumerate(args[0]):
|
112
|
-
global_size.append(var.max+1)
|
113
|
-
lang.ins.append(AssemblyInstruction(UOps.SPECIAL, lang.newreg(var, dtype=dtypes.int32), [], f"gid{len(args[0])-1-i}"))
|
114
|
-
elif args[1] == "local":
|
115
|
-
for i,var in enumerate(args[0]):
|
116
|
-
local_size.append(var.max+1)
|
117
|
-
lang.ins.append(AssemblyInstruction(UOps.SPECIAL, lang.newreg(var, dtype=dtypes.int32), [], f"lid{len(args[0])-1-i}"))
|
118
|
-
else:
|
119
|
-
for var in args[0]:
|
120
|
-
if not isinstance(var, NumNode): # TODO: why is this coming through?
|
121
|
-
lang.ins.append(AssemblyInstruction(UOps.LOAD, lang.newreg(var, dtype=dtypes.int32, scalar=True), [], 0))
|
122
|
-
lang.ins.append(AssemblyInstruction(UOps.LABEL, None, [], "$loop_"+var.expr))
|
123
|
-
elif uop == UOps.ENDLOOP:
|
124
|
-
if args[1] not in ["global", "local", "global+local"]:
|
125
|
-
for var in reversed(args[0]):
|
126
|
-
if not isinstance(var, NumNode): # TODO: why is this coming through?
|
127
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, lang.tor[var], [lang.tor[var], 1], BinaryOps.ADD))
|
128
|
-
pred = lang.render_alu(BinaryOps.CMPLT, lang.tor[var], var.max+1, dtypes.bool)
|
129
|
-
lang.ins.append(AssemblyInstruction(UOps.COND_BRANCH, None, [pred], ("$loop_"+var.expr, True)))
|
130
|
-
elif args[1] == "global+local":
|
131
|
-
for i, var in enumerate(reversed(args[0])):
|
132
|
-
lang.ins.append(AssemblyInstruction(UOps.ENDLOOP, None, [lang.tor[var]], (var.max+1, f"gid{i}")))
|
133
|
-
elif args[1] == 'local':
|
134
|
-
for i, var in enumerate(reversed(args[0])):
|
135
|
-
lang.ins.append(AssemblyInstruction(UOps.ENDLOOP, None, [lang.tor[var]], (var.max+1, f"lid{i}")))
|
136
|
-
elif uop == UOps.CAST and newvar is not None:
|
137
|
-
# TODO: we should reconsider outputting CAST in the linearizer. these are needless copies
|
138
|
-
out = lang.newreg(newvar)
|
139
|
-
for i,sr in enumerate(out.subregs()):
|
140
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, sr, [lang.tor[vin[i]]], UnaryOps.NOOP))
|
141
|
-
elif uop == UOps.ALU and newvar is not None:
|
142
|
-
out = lang.newreg(newvar) if newvar not in lang.tor else lang.tor[newvar]
|
143
|
-
# this is the only thing that can violate SSA
|
144
|
-
if args in [BinaryOps.CMPLT]:
|
145
|
-
pred_reg = lang.newreg((newvar, 'pred'), dtype=dtypes.bool)
|
146
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, pred_reg, [lang.tor[x] for x in vin], args))
|
147
|
-
lang.ins.append(AssemblyInstruction(UOps.CAST, out, [pred_reg], args))
|
148
|
-
elif args == BinaryOps.DIV and lang.no_div:
|
149
|
-
tmp = lang.newreg((newvar, "rcp"))
|
150
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, tmp, [lang.tor[vin[1]]], UnaryOps.RECIP))
|
151
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, out, [lang.tor[vin[0]], tmp], BinaryOps.MUL))
|
152
|
-
elif args == UnaryOps.SIN and lang.sin_is_sin2pi:
|
153
|
-
tmp = lang.newreg((newvar, "2pi"))
|
154
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, tmp, [lang.tor[vin[0]], 1/(math.pi*2)], BinaryOps.MUL))
|
155
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, out, [tmp], args))
|
156
|
-
else:
|
157
|
-
lang.ins.append(AssemblyInstruction(UOps.ALU, out, [lang.tor[x] for x in vin], args))
|
158
|
-
elif uop == UOps.LOAD and newvar is not None:
|
159
|
-
if isinstance(args, ConstOp):
|
160
|
-
if args.valid.min == 0 and args.valid.max == 1:
|
161
|
-
reg = lang.newreg(newvar, dtype=newvar.dtype)
|
162
|
-
lang.ins.append(AssemblyInstruction(UOps.LOAD, reg, [], args.invalid_value))
|
163
|
-
pred = args.valid.render(lang.render_ops, lang)
|
164
|
-
lang.ins.append(AssemblyInstruction(UOps.COND_BRANCH, None, [pred], (f"$skipload_{skipload_branch}", False)))
|
165
|
-
lang.ins.append(AssemblyInstruction(UOps.LOAD, reg, [], args.value))
|
166
|
-
lang.ins.append(AssemblyInstruction(UOps.LABEL, None, [], f"$skipload_{skipload_branch}"))
|
167
|
-
skipload_branch += 1
|
168
|
-
else:
|
169
|
-
lang.ins.append(AssemblyInstruction(UOps.LOAD, lang.newreg(newvar, dtype=newvar.dtype), [], args.value if args.valid.min == 1 else args.invalid_value))
|
170
|
-
else:
|
171
|
-
idx, treg, off = lang.addr_w_offset(args)
|
172
|
-
reg = lang.newreg(newvar, dtype=newvar.dtype, scalar=(idx.scalar and (not isinstance(treg, Register) or treg.scalar))) # and not dtypes.is_float(newvar.dtype)))
|
173
|
-
if args.valid.min == 0:
|
174
|
-
lang.ins.append(AssemblyInstruction(UOps.LOAD, reg, [], 0))
|
175
|
-
if args.valid.max == 1:
|
176
|
-
pred = args.valid.render(lang.render_ops, lang)
|
177
|
-
lang.ins.append(AssemblyInstruction(UOps.COND_BRANCH, None, [pred], (f"$skipload_{skipload_branch}", False)))
|
178
|
-
if args.valid.max == 1:
|
179
|
-
# NOTE: you can't compute the index in here, because it assumes it's all available later
|
180
|
-
lang.ins.append(AssemblyInstruction(UOps.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global' if not args.local else 'shared', args.memory_dtype if args.memory_dtype != dtypes.float else None)))
|
181
|
-
if args.valid.min == 0 and args.valid.max == 1:
|
182
|
-
lang.ins.append(AssemblyInstruction(UOps.LABEL, None, [], f"$skipload_{skipload_branch}"))
|
183
|
-
skipload_branch += 1
|
184
|
-
elif uop == UOps.STORE:
|
185
|
-
idx, treg, off = lang.addr_w_offset(args)
|
186
|
-
lang.ins.append(AssemblyInstruction(UOps.STORE, None, [idx, lang.tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global' if not args.local else 'shared', args.memory_dtype if args.memory_dtype != dtypes.float else None)))
|
187
|
-
|
188
|
-
if DEBUG >= 4:
|
189
|
-
for tins in lang.ins: print(tins)
|
190
|
-
return global_size, local_size
|