ninetoothed 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ninetoothed-0.1.1/.github/workflows/pytest.yml +18 -0
- ninetoothed-0.1.1/.idea/.gitignore +3 -0
- ninetoothed-0.1.1/.idea/inspectionProfiles/profiles_settings.xml +6 -0
- ninetoothed-0.1.1/.idea/misc.xml +4 -0
- ninetoothed-0.1.1/.idea/modules.xml +8 -0
- ninetoothed-0.1.1/.idea/ninetoothed.iml +10 -0
- ninetoothed-0.1.1/.idea/vcs.xml +6 -0
- ninetoothed-0.1.1/.idea/workspace.xml +47 -0
- ninetoothed-0.1.1/PKG-INFO +79 -0
- ninetoothed-0.1.1/README.md +65 -0
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/docs/README.zh.md +5 -12
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/pyproject.toml +1 -1
- ninetoothed-0.1.1/requirements.txt +7 -0
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/src/ninetoothed/jit.py +62 -31
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/src/ninetoothed/tensor.py +1 -1
- ninetoothed-0.1.1/tests/__init__.py +0 -0
- ninetoothed-0.1.1/tests/skippers.py +16 -0
- ninetoothed-0.1.1/tests/test_add.py +41 -0
- ninetoothed-0.1.1/tests/test_matmul.py +72 -0
- ninetoothed-0.1.0/PKG-INFO +0 -19
- ninetoothed-0.1.0/README.md +0 -5
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/.github/workflows/ruff.yml +0 -0
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/.gitignore +0 -0
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/LICENSE +0 -0
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/src/ninetoothed/__init__.py +0 -0
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/src/ninetoothed/language.py +0 -0
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/src/ninetoothed/symbol.py +0 -0
- {ninetoothed-0.1.0 → ninetoothed-0.1.1}/src/ninetoothed/torchifier.py +0 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
name: pytest
|
2
|
+
on: [push, pull_request]
|
3
|
+
jobs:
|
4
|
+
pytest:
|
5
|
+
runs-on: ubuntu-latest
|
6
|
+
steps:
|
7
|
+
- uses: actions/checkout@v4
|
8
|
+
- name: Set up Python
|
9
|
+
uses: actions/setup-python@v5
|
10
|
+
with:
|
11
|
+
python-version: "3.10"
|
12
|
+
- name: Install dependencies
|
13
|
+
run: |
|
14
|
+
python -m pip install --upgrade pip
|
15
|
+
pip install -r requirements.txt
|
16
|
+
- name: Test with pytest
|
17
|
+
run: |
|
18
|
+
pytest --doctest-modules --junitxml=junit/test-results.xml --cov=ninetoothed --cov-report=xml --cov-report=html
|
@@ -0,0 +1,8 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<project version="4">
|
3
|
+
<component name="ProjectModuleManager">
|
4
|
+
<modules>
|
5
|
+
<module fileurl="file://$PROJECT_DIR$/.idea/ninetoothed.iml" filepath="$PROJECT_DIR$/.idea/ninetoothed.iml" />
|
6
|
+
</modules>
|
7
|
+
</component>
|
8
|
+
</project>
|
@@ -0,0 +1,10 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<module type="PYTHON_MODULE" version="4">
|
3
|
+
<component name="NewModuleRootManager">
|
4
|
+
<content url="file://$MODULE_DIR$">
|
5
|
+
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
6
|
+
</content>
|
7
|
+
<orderEntry type="inheritedJdk" />
|
8
|
+
<orderEntry type="sourceFolder" forTests="false" />
|
9
|
+
</component>
|
10
|
+
</module>
|
@@ -0,0 +1,47 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<project version="4">
|
3
|
+
<component name="ChangeListManager">
|
4
|
+
<list default="true" id="b7a10642-6e58-4644-a069-aad20bd5add0" name="Changes" comment="" />
|
5
|
+
<option name="SHOW_DIALOG" value="false" />
|
6
|
+
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
7
|
+
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
8
|
+
<option name="LAST_RESOLUTION" value="IGNORE" />
|
9
|
+
</component>
|
10
|
+
<component name="Git.Settings">
|
11
|
+
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
12
|
+
</component>
|
13
|
+
<component name="ProjectColorInfo"><![CDATA[{
|
14
|
+
"customColor": "",
|
15
|
+
"associatedIndex": 8
|
16
|
+
}]]></component>
|
17
|
+
<component name="ProjectId" id="2jzxGjCO1rbfcApmCQlTd14207t" />
|
18
|
+
<component name="ProjectViewState">
|
19
|
+
<option name="hideEmptyMiddlePackages" value="true" />
|
20
|
+
<option name="showLibraryContents" value="true" />
|
21
|
+
</component>
|
22
|
+
<component name="PropertiesComponent"><![CDATA[{
|
23
|
+
"keyToString": {
|
24
|
+
"RunOnceActivity.ShowReadmeOnStart": "true",
|
25
|
+
"git-widget-placeholder": "dev",
|
26
|
+
"settings.editor.selected.configurable": "advanced.settings"
|
27
|
+
}
|
28
|
+
}]]></component>
|
29
|
+
<component name="SharedIndexes">
|
30
|
+
<attachedChunks>
|
31
|
+
<set>
|
32
|
+
<option value="bundled-python-sdk-975db3bf15a3-31b6be0877a2-com.jetbrains.pycharm.community.sharedIndexes.bundled-PC-241.18034.82" />
|
33
|
+
</set>
|
34
|
+
</attachedChunks>
|
35
|
+
</component>
|
36
|
+
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
37
|
+
<component name="TaskManager">
|
38
|
+
<task active="true" id="Default" summary="Default task">
|
39
|
+
<changelist id="b7a10642-6e58-4644-a069-aad20bd5add0" name="Changes" comment="" />
|
40
|
+
<created>1722403613879</created>
|
41
|
+
<option name="number" value="Default" />
|
42
|
+
<option name="presentableId" value="Default" />
|
43
|
+
<updated>1722403613879</updated>
|
44
|
+
</task>
|
45
|
+
<servers />
|
46
|
+
</component>
|
47
|
+
</project>
|
@@ -0,0 +1,79 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: ninetoothed
|
3
|
+
Version: 0.1.1
|
4
|
+
Summary: A domain-specific language based on Triton but providing higher-level abstraction.
|
5
|
+
Project-URL: Homepage, https://github.com/InfiniTensor/ninetoothed
|
6
|
+
Project-URL: Issues, https://github.com/InfiniTensor/ninetoothed/issues
|
7
|
+
Author-email: Jiacheng Huang <huangjiacheng0709@outlook.com>
|
8
|
+
License-File: LICENSE
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
12
|
+
Requires-Python: >=3.10
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
|
15
|
+
# NineToothed
|
16
|
+
|
17
|
+
A domain-specific language (DSL) based on Triton but providing higher-level abstractions.
|
18
|
+
|
19
|
+
**Other language versions: [English](README.md), [简体中文](docs/README.zh.md).**
|
20
|
+
|
21
|
+
## Installation
|
22
|
+
|
23
|
+
We can use `pip` to install `ninetoothed`.
|
24
|
+
|
25
|
+
```shell
|
26
|
+
pip install ninetoothed
|
27
|
+
```
|
28
|
+
|
29
|
+
After successfully running the above command, `ninetoothed` will be installed. However, to fully utilize its capabilities, you also need to install `triton` and a deep learning framework supported by `ninetoothed`. For trial purposes, we recommend installing `triton` and `torch`.
|
30
|
+
|
31
|
+
## Usage
|
32
|
+
|
33
|
+
Currently, we can use the `Tensor` and `Symbol` classes in the `ninetoothed` package to perform meta-operations like `tile` and `expand` to easily construct kernel functions. Below, we will use these features to create vector addition and matrix multiplication kernel functions.
|
34
|
+
|
35
|
+
### Vector Addition
|
36
|
+
|
37
|
+
```python
|
38
|
+
BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)
|
39
|
+
|
40
|
+
@ninetoothed.jit
|
41
|
+
def add_kernel(
|
42
|
+
x: Tensor(1).tile((BLOCK_SIZE,)),
|
43
|
+
y: Tensor(1).tile((BLOCK_SIZE,)),
|
44
|
+
z: Tensor(1).tile((BLOCK_SIZE,)),
|
45
|
+
):
|
46
|
+
z = x + y
|
47
|
+
```
|
48
|
+
|
49
|
+
In this code, we first define `BLOCK_SIZE`, which is a `Symbol`. You can think of `"BLOCK_SIZE"` as its name. We see that `meta` is set to `True`, indicating to the compiler that it is a meta-parameter and its value can be determined by the compiler. The `Tensor(1)` constructs a one-dimensional tensor (vector), and `Tensor(1).tile((BLOCK_SIZE,))` means we want to create a vector and divide it into blocks of size `BLOCK_SIZE`. Suppose the size of this vector is `8192` and `BLOCK_SIZE` is `1024`, then the vector will be divided into `8` blocks, each of size `1024`.
|
50
|
+
|
51
|
+
By using type annotations, we tell the compiler that we will have three tensor parameters, which will be divided into blocks, and `x`, `y`, and `z` are these blocks. It's important to understand that `x`, `y`, and `z` are the blocks, not the tensors themselves. In the function body, `x`, `y`, and `z` are also the blocks. The rest is straightforward (only one line `z = x + y` left, haha), we add each block of `x` and `y` and store it in `z`. Since each block of the parameter tensors undergoes this operation, the addition is completed for the whole tensors as well.
|
52
|
+
|
53
|
+
### Matrix Multiplication
|
54
|
+
|
55
|
+
```python
|
56
|
+
BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
|
57
|
+
BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
|
58
|
+
BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", meta=True)
|
59
|
+
|
60
|
+
a_tiled = Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_K)).tile((1, -1))
|
61
|
+
b_tiled = Tensor(2).tile((BLOCK_SIZE_K, BLOCK_SIZE_N)).tile((-1, 1))
|
62
|
+
c_tiled = Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
|
63
|
+
|
64
|
+
a_tiled = a_tiled.expand((-1, c_tiled.shape[1]))
|
65
|
+
b_tiled = b_tiled.expand((c_tiled.shape[0], -1))
|
66
|
+
|
67
|
+
@ninetoothed.jit
|
68
|
+
def matmul_kernel(a: a_tiled, b: b_tiled, c: c_tiled):
|
69
|
+
accumulator = ninetoothed.language.zeros(
|
70
|
+
c.shape, dtype=ninetoothed.language.float32
|
71
|
+
)
|
72
|
+
for k in range(a.shape[1]):
|
73
|
+
accumulator = ninetoothed.language.dot(a[0, k], b[k, 0], accumulator)
|
74
|
+
c = accumulator.to(ninetoothed.language.float16)
|
75
|
+
```
|
76
|
+
|
77
|
+
For matrix multiplication, we also have three tensor parameters, but the tiling method is more complex than vector addition. We denote the three matrices as $A$, $B$, and $C$, where $A$ and $B$ are inputs, and $C$ is the output. Tiling $C$ is simple; we just need to divide it into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_N)` by rows and columns. Once each block computes its result, the entire $C$ is computed. However, how should we tile $A$ and $B$? The answer is to introduce another meta-parameter `BLOCK_SIZE_K`. This way, we can divide $A$ into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_K)` and $B$ into blocks of size `(BLOCK_SIZE_K, BLOCK_SIZE_N)`. However, for matrix multiplication, $A$ and $B$ do not correspond block by block; each row of $A$ needs to correspond to each column of $B$. Therefore, we need to further `tile` $A$ and $B$ by rows and columns, respectively. Up to this point, we have a set of row blocks of $A$ and column blocks of $B$. However, each row block of $A$ must correspond to every column block of $B$. This is where `expand` comes in. We `expand` the row blocks of $A$ along the columns to the number of columns of $C$ and the column blocks of $B$ along the rows to the number of rows of $C$. This way, we successfully tile $A$, $B$, and $C$.
|
78
|
+
|
79
|
+
With tiling done, the rest is simple. In the function body, we define an `accumulator` to accumulate intermediate results. We then iterate through the corresponding row blocks of $A$ and column blocks of $B$, multiplying them and accumulating the results in `accumulator`. Finally, we place the `accumulator` in the corresponding block of $C$. Since each block of the parameter tensors undergoes this operation, the multiplication is completed for the whole tensors as well.
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# NineToothed
|
2
|
+
|
3
|
+
A domain-specific language (DSL) based on Triton but providing higher-level abstractions.
|
4
|
+
|
5
|
+
**Other language versions: [English](README.md), [简体中文](docs/README.zh.md).**
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
We can use `pip` to install `ninetoothed`.
|
10
|
+
|
11
|
+
```shell
|
12
|
+
pip install ninetoothed
|
13
|
+
```
|
14
|
+
|
15
|
+
After successfully running the above command, `ninetoothed` will be installed. However, to fully utilize its capabilities, you also need to install `triton` and a deep learning framework supported by `ninetoothed`. For trial purposes, we recommend installing `triton` and `torch`.
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
Currently, we can use the `Tensor` and `Symbol` classes in the `ninetoothed` package to perform meta-operations like `tile` and `expand` to easily construct kernel functions. Below, we will use these features to create vector addition and matrix multiplication kernel functions.
|
20
|
+
|
21
|
+
### Vector Addition
|
22
|
+
|
23
|
+
```python
|
24
|
+
BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)
|
25
|
+
|
26
|
+
@ninetoothed.jit
|
27
|
+
def add_kernel(
|
28
|
+
x: Tensor(1).tile((BLOCK_SIZE,)),
|
29
|
+
y: Tensor(1).tile((BLOCK_SIZE,)),
|
30
|
+
z: Tensor(1).tile((BLOCK_SIZE,)),
|
31
|
+
):
|
32
|
+
z = x + y
|
33
|
+
```
|
34
|
+
|
35
|
+
In this code, we first define `BLOCK_SIZE`, which is a `Symbol`. You can think of `"BLOCK_SIZE"` as its name. We see that `meta` is set to `True`, indicating to the compiler that it is a meta-parameter and its value can be determined by the compiler. The `Tensor(1)` constructs a one-dimensional tensor (vector), and `Tensor(1).tile((BLOCK_SIZE,))` means we want to create a vector and divide it into blocks of size `BLOCK_SIZE`. Suppose the size of this vector is `8192` and `BLOCK_SIZE` is `1024`, then the vector will be divided into `8` blocks, each of size `1024`.
|
36
|
+
|
37
|
+
By using type annotations, we tell the compiler that we will have three tensor parameters, which will be divided into blocks, and `x`, `y`, and `z` are these blocks. It's important to understand that `x`, `y`, and `z` are the blocks, not the tensors themselves. In the function body, `x`, `y`, and `z` are also the blocks. The rest is straightforward (only one line `z = x + y` left, haha), we add each block of `x` and `y` and store it in `z`. Since each block of the parameter tensors undergoes this operation, the addition is completed for the whole tensors as well.
|
38
|
+
|
39
|
+
### Matrix Multiplication
|
40
|
+
|
41
|
+
```python
|
42
|
+
BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
|
43
|
+
BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
|
44
|
+
BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", meta=True)
|
45
|
+
|
46
|
+
a_tiled = Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_K)).tile((1, -1))
|
47
|
+
b_tiled = Tensor(2).tile((BLOCK_SIZE_K, BLOCK_SIZE_N)).tile((-1, 1))
|
48
|
+
c_tiled = Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
|
49
|
+
|
50
|
+
a_tiled = a_tiled.expand((-1, c_tiled.shape[1]))
|
51
|
+
b_tiled = b_tiled.expand((c_tiled.shape[0], -1))
|
52
|
+
|
53
|
+
@ninetoothed.jit
|
54
|
+
def matmul_kernel(a: a_tiled, b: b_tiled, c: c_tiled):
|
55
|
+
accumulator = ninetoothed.language.zeros(
|
56
|
+
c.shape, dtype=ninetoothed.language.float32
|
57
|
+
)
|
58
|
+
for k in range(a.shape[1]):
|
59
|
+
accumulator = ninetoothed.language.dot(a[0, k], b[k, 0], accumulator)
|
60
|
+
c = accumulator.to(ninetoothed.language.float16)
|
61
|
+
```
|
62
|
+
|
63
|
+
For matrix multiplication, we also have three tensor parameters, but the tiling method is more complex than vector addition. We denote the three matrices as $A$, $B$, and $C$, where $A$ and $B$ are inputs, and $C$ is the output. Tiling $C$ is simple; we just need to divide it into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_N)` by rows and columns. Once each block computes its result, the entire $C$ is computed. However, how should we tile $A$ and $B$? The answer is to introduce another meta-parameter `BLOCK_SIZE_K`. This way, we can divide $A$ into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_K)` and $B$ into blocks of size `(BLOCK_SIZE_K, BLOCK_SIZE_N)`. However, for matrix multiplication, $A$ and $B$ do not correspond block by block; each row of $A$ needs to correspond to each column of $B$. Therefore, we need to further `tile` $A$ and $B$ by rows and columns, respectively. Up to this point, we have a set of row blocks of $A$ and column blocks of $B$. However, each row block of $A$ must correspond to every column block of $B$. This is where `expand` comes in. We `expand` the row blocks of $A$ along the columns to the number of columns of $C$ and the column blocks of $B$ along the rows to the number of rows of $C$. This way, we successfully tile $A$, $B$, and $C$.
|
64
|
+
|
65
|
+
With tiling done, the rest is simple. In the function body, we define an `accumulator` to accumulate intermediate results. We then iterate through the corresponding row blocks of $A$ and column blocks of $B$, multiplying them and accumulating the results in `accumulator`. Finally, we place the `accumulator` in the corresponding block of $C$. Since each block of the parameter tensors undergoes this operation, the multiplication is completed for the whole tensors as well.
|
@@ -6,20 +6,13 @@
|
|
6
6
|
|
7
7
|
## 安装
|
8
8
|
|
9
|
-
|
9
|
+
我们可以使用 `pip` 安装 `ninetoothed`。
|
10
10
|
|
11
11
|
```shell
|
12
|
-
|
13
|
-
pip install ./ninetoothed
|
12
|
+
pip install ninetoothed
|
14
13
|
```
|
15
14
|
|
16
|
-
成功运行完以上两个命令之后,`ninetoothed`
|
17
|
-
|
18
|
-
```shell
|
19
|
-
pip install triton
|
20
|
-
```
|
21
|
-
|
22
|
-
其余包可以根据需要自行安装,如 `torch`、`matplotlib`、`pandas` 等。
|
15
|
+
成功运行完以上命令之后,`ninetoothed` 就被安装好了。但是除了 `ninetoothed` 的本体之外,如果我们想要真正发挥它的作用,至少还需要安装 `triton` 和一个 `ninetoothed` 所支持的深度学习框架。以尝试为目的的话,我们推荐安装 `triton` 和 `torch`。
|
23
16
|
|
24
17
|
## 使用
|
25
18
|
|
@@ -67,6 +60,6 @@ def matmul_kernel(a: a_tiled, b: b_tiled, c: c_tiled):
|
|
67
60
|
c = accumulator.to(ninetoothed.language.float16)
|
68
61
|
```
|
69
62
|
|
70
|
-
对于矩阵乘法来说,我们也有三个参数张量,但是分块的方式肯定比向量加法要复杂一些。我们将三个矩阵分别记作 A
|
63
|
+
对于矩阵乘法来说,我们也有三个参数张量,但是分块的方式肯定比向量加法要复杂一些。我们将三个矩阵分别记作 $A$、$B$、$C$,其中 $A$ 和 $B$ 为输入,$C$ 为输出。其中 $C$ 的分块操作很简单,我们只需要按照行和列,将其分成大小为 `(BLOCK_SIZE_M, BLOCK_SIZE_N)` 的块即可,这样只要每个这样的块都算出了结果,整个 $C$ 也就都算出了结果。那么该如何分 $A$ 和 $B$ 呢?答案是再引入一个元参数 `BLOCK_SIZE_K`,这样我们就可以把 $A$ 分成 `(BLOCK_SIZE_M, BLOCK_SIZE_K)` 大小的块,把 $B$ 分成 `(BLOCK_SIZE_K, BLOCK_SIZE_N)` 的块。但是对于矩阵乘法,$A$ 和 $B$ 并不是块块对应,而是需要对应 $A$ 的每一行和 $B$ 的每一列,所以我们还需要继续 `tile`,把 $A$ 和 $B$ 进一步分成以行为单位和以列为单位的块。到目前为止,我们有了一堆 $A$ 的行块和 $B$ 的列块,但是对于每一个 $A$ 的行块,我们都要对应 $B$ 的每一个列块。这个时候,我们就需要进行 `expand` 了,我们把 $A$ 的行块沿着列 `expand` 成 $C$ 的列数那么多列,把 $B$ 的列块沿着行 `expand` 成 $C$ 的行数那么多行。这样,我们就成功地将 $A$、$B$、$C$ 三者都分好了块,并且对于每一个 $C$ 的块,我们都有对应好的 $A$ 的行块和 $B$ 的列块。
|
71
64
|
|
72
|
-
对应好了分块,后续的部分就简单多了。在函数体当中,我们定义了一个 `accumulator`,用于累加中间结果,之后就遍历了对应好的 A 的行块和 B 的列块,并且把他们相乘的结果累加到了 `accumulator` 当中,最后再将 `accumulator` 放到了对应的 C 的分块当中。由于参数张量被分成的每一块都被执行了这样的操作,因此即便对于整体而言,乘法也被完成了。
|
65
|
+
对应好了分块,后续的部分就简单多了。在函数体当中,我们定义了一个 `accumulator`,用于累加中间结果,之后就遍历了对应好的 $A$ 的行块和 $B$ 的列块,并且把他们相乘的结果累加到了 `accumulator` 当中,最后再将 `accumulator` 放到了对应的 $C$ 的分块当中。由于参数张量被分成的每一块都被执行了这样的操作,因此即便对于整体而言,乘法也被完成了。
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "ninetoothed"
|
7
|
-
version = "0.1.0"
|
7
|
+
version = "0.1.1"
|
8
8
|
authors = [{ name = "Jiacheng Huang", email = "huangjiacheng0709@outlook.com" }]
|
9
9
|
description = "A domain-specific language based on Triton but providing higher-level abstraction."
|
10
10
|
readme = "README.md"
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import ast
|
2
|
+
import collections
|
2
3
|
import functools
|
3
4
|
import inspect
|
4
5
|
import itertools
|
@@ -12,6 +13,67 @@ from ninetoothed.tensor import Tensor
|
|
12
13
|
from ninetoothed.torchifier import Torchifier
|
13
14
|
|
14
15
|
|
16
|
+
def jit(func):
|
17
|
+
return JIT(func)()
|
18
|
+
|
19
|
+
|
20
|
+
class JIT:
|
21
|
+
handles = collections.defaultdict(dict)
|
22
|
+
|
23
|
+
def __init__(self, func):
|
24
|
+
self.func = func
|
25
|
+
|
26
|
+
def __call__(self):
|
27
|
+
source_file = inspect.getsourcefile(self.func)
|
28
|
+
source_line = inspect.getsourcelines(self.func)[1]
|
29
|
+
|
30
|
+
if (
|
31
|
+
source_file in type(self).handles
|
32
|
+
and source_line in type(self).handles[source_file]
|
33
|
+
):
|
34
|
+
return type(self).handles[source_file][source_line]
|
35
|
+
|
36
|
+
source = textwrap.dedent(inspect.getsource(self.func))
|
37
|
+
tree = ast.parse(source)
|
38
|
+
|
39
|
+
CodeGenerator(inspect.get_annotations(self.func)).visit(tree)
|
40
|
+
Tritonizer().visit(tree)
|
41
|
+
ast.fix_missing_locations(tree)
|
42
|
+
|
43
|
+
unparsed = ast.unparse(tree).replace("None:", ":").replace(":None", ":")
|
44
|
+
|
45
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".py") as temp_file:
|
46
|
+
temp_file.write(unparsed.encode("utf-8"))
|
47
|
+
temp_file_name = temp_file.name
|
48
|
+
|
49
|
+
with open(temp_file_name, "r") as temp_file:
|
50
|
+
code = compile(
|
51
|
+
source=temp_file.read(),
|
52
|
+
filename=temp_file_name,
|
53
|
+
mode="exec",
|
54
|
+
)
|
55
|
+
|
56
|
+
namespace = {}
|
57
|
+
exec(code, namespace)
|
58
|
+
|
59
|
+
class Handle:
|
60
|
+
def __init__(self, kernel, launch):
|
61
|
+
self._kernel = kernel
|
62
|
+
self._launch = launch
|
63
|
+
|
64
|
+
def __call__(self, *args, **kwargs):
|
65
|
+
return self._launch(*args, **kwargs)
|
66
|
+
|
67
|
+
handle = Handle(
|
68
|
+
namespace[self.func.__name__],
|
69
|
+
namespace[f"launch_{self.func.__name__}"],
|
70
|
+
)
|
71
|
+
|
72
|
+
type(self).handles[source_file][source_line] = handle
|
73
|
+
|
74
|
+
return handle
|
75
|
+
|
76
|
+
|
15
77
|
class CodeGenerator(ast.NodeTransformer):
|
16
78
|
def __init__(self, context):
|
17
79
|
super().__init__()
|
@@ -286,34 +348,3 @@ class Tritonizer(ast.NodeTransformer):
|
|
286
348
|
)
|
287
349
|
|
288
350
|
return node
|
289
|
-
|
290
|
-
|
291
|
-
def jit(func):
|
292
|
-
source = textwrap.dedent(inspect.getsource(func))
|
293
|
-
tree = ast.parse(source)
|
294
|
-
|
295
|
-
CodeGenerator(func.__annotations__).visit(tree)
|
296
|
-
Tritonizer().visit(tree)
|
297
|
-
ast.fix_missing_locations(tree)
|
298
|
-
|
299
|
-
unparsed = ast.unparse(tree).replace("None:", ":").replace(":None", ":")
|
300
|
-
|
301
|
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".py") as temp_file:
|
302
|
-
temp_file.write(unparsed.encode("utf-8"))
|
303
|
-
temp_file_name = temp_file.name
|
304
|
-
|
305
|
-
with open(temp_file_name, "r") as temp_file:
|
306
|
-
code = compile(source=temp_file.read(), filename=temp_file_name, mode="exec")
|
307
|
-
|
308
|
-
namespace = {}
|
309
|
-
exec(code, namespace)
|
310
|
-
|
311
|
-
class Handle:
|
312
|
-
def __init__(self, kernel, launch):
|
313
|
-
self._kernel = kernel
|
314
|
-
self._launch = launch
|
315
|
-
|
316
|
-
def __call__(self, *args, **kwargs):
|
317
|
-
return self._launch(*args, **kwargs)
|
318
|
-
|
319
|
-
return Handle(namespace[func.__name__], namespace[f"launch_{func.__name__}"])
|
@@ -46,7 +46,7 @@ class Tensor:
|
|
46
46
|
new_size = call("cdiv", size, tile_size)
|
47
47
|
outer_shape.append(new_size)
|
48
48
|
|
49
|
-
new_stride =
|
49
|
+
new_stride = stride * tile_size // tile_stride
|
50
50
|
outer_strides.append(new_stride)
|
51
51
|
|
52
52
|
inner_shape.append(tile_size)
|
File without changes
|
@@ -0,0 +1,16 @@
|
|
1
|
+
import pytest
|
2
|
+
import torch
|
3
|
+
|
4
|
+
|
5
|
+
def skip_if_cuda_not_available(func):
|
6
|
+
return pytest.mark.skipif(
|
7
|
+
not torch.cuda.is_available(),
|
8
|
+
reason="CUDA not available",
|
9
|
+
)(func)
|
10
|
+
|
11
|
+
|
12
|
+
def skip_if_float8_e5m2_not_supported(func):
|
13
|
+
return pytest.mark.skipif(
|
14
|
+
not hasattr(torch, "float8_e5m2"),
|
15
|
+
reason="`float8_e5m2` not supported",
|
16
|
+
)(func)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
import ninetoothed
|
2
|
+
import torch
|
3
|
+
from ninetoothed import Symbol, Tensor
|
4
|
+
|
5
|
+
from tests.skippers import skip_if_cuda_not_available
|
6
|
+
|
7
|
+
|
8
|
+
def add(lhs, rhs):
|
9
|
+
BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)
|
10
|
+
|
11
|
+
@ninetoothed.jit
|
12
|
+
def add_kernel(
|
13
|
+
lhs: Tensor(1).tile((BLOCK_SIZE,)),
|
14
|
+
rhs: Tensor(1).tile((BLOCK_SIZE,)),
|
15
|
+
output: Tensor(1).tile((BLOCK_SIZE,)),
|
16
|
+
):
|
17
|
+
output = lhs + rhs # noqa: F841
|
18
|
+
|
19
|
+
output = torch.empty_like(lhs)
|
20
|
+
|
21
|
+
add_kernel(lhs, rhs, output)
|
22
|
+
|
23
|
+
return output
|
24
|
+
|
25
|
+
|
26
|
+
@skip_if_cuda_not_available
|
27
|
+
class TestCUDA:
|
28
|
+
@classmethod
|
29
|
+
def setup_class(cls):
|
30
|
+
torch.manual_seed(0)
|
31
|
+
|
32
|
+
size = 98432
|
33
|
+
|
34
|
+
cls.lhs = torch.rand(size, device="cuda")
|
35
|
+
cls.rhs = torch.rand(size, device="cuda")
|
36
|
+
|
37
|
+
def test_fp32(self):
|
38
|
+
lhs = type(self).lhs.to(torch.float32)
|
39
|
+
rhs = type(self).rhs.to(torch.float32)
|
40
|
+
|
41
|
+
assert torch.allclose(add(lhs, rhs), lhs + rhs)
|
@@ -0,0 +1,72 @@
|
|
1
|
+
import ninetoothed
|
2
|
+
import torch
|
3
|
+
from ninetoothed import Symbol, Tensor
|
4
|
+
|
5
|
+
from tests.skippers import skip_if_cuda_not_available, skip_if_float8_e5m2_not_supported
|
6
|
+
|
7
|
+
|
8
|
+
def matmul(lhs, rhs):
|
9
|
+
BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
|
10
|
+
BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
|
11
|
+
BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", meta=True)
|
12
|
+
|
13
|
+
output_tiled = Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
|
14
|
+
|
15
|
+
lhs_tiled = (
|
16
|
+
Tensor(2)
|
17
|
+
.tile((BLOCK_SIZE_M, BLOCK_SIZE_K))
|
18
|
+
.tile((1, -1))
|
19
|
+
.expand((-1, output_tiled.shape[1]))
|
20
|
+
)
|
21
|
+
rhs_tiled = (
|
22
|
+
Tensor(2)
|
23
|
+
.tile((BLOCK_SIZE_K, BLOCK_SIZE_N))
|
24
|
+
.tile((-1, 1))
|
25
|
+
.expand((output_tiled.shape[0], -1))
|
26
|
+
)
|
27
|
+
|
28
|
+
@ninetoothed.jit
|
29
|
+
def matmul_kernel(lhs: lhs_tiled, rhs: rhs_tiled, output: output_tiled):
|
30
|
+
accumulator = ninetoothed.language.zeros(
|
31
|
+
output.shape, dtype=ninetoothed.language.float32
|
32
|
+
)
|
33
|
+
for k in range(lhs.shape[1]):
|
34
|
+
accumulator = ninetoothed.language.dot(lhs[0, k], rhs[k, 0], accumulator)
|
35
|
+
output = accumulator.to(ninetoothed.language.float16)
|
36
|
+
|
37
|
+
output = torch.empty(
|
38
|
+
(lhs.shape[0], rhs.shape[1]), device=lhs.device, dtype=torch.float16
|
39
|
+
)
|
40
|
+
|
41
|
+
matmul_kernel(lhs, rhs, output)
|
42
|
+
|
43
|
+
return output
|
44
|
+
|
45
|
+
|
46
|
+
@skip_if_cuda_not_available
|
47
|
+
class TestCUDA:
|
48
|
+
@classmethod
|
49
|
+
def setup_class(cls):
|
50
|
+
torch.manual_seed(0)
|
51
|
+
|
52
|
+
shape = (512, 512)
|
53
|
+
|
54
|
+
cls.lhs = torch.randn(shape, device="cuda")
|
55
|
+
cls.rhs = torch.randn(shape, device="cuda")
|
56
|
+
|
57
|
+
def test_fp16(self):
|
58
|
+
lhs = type(self).lhs.to(torch.float16)
|
59
|
+
rhs = type(self).rhs.to(torch.float16)
|
60
|
+
|
61
|
+
assert torch.allclose(matmul(lhs, rhs), torch.matmul(lhs, rhs))
|
62
|
+
|
63
|
+
@skip_if_float8_e5m2_not_supported
|
64
|
+
def test_fp8(self):
|
65
|
+
lhs = type(self).lhs.to(torch.float8_e5m2)
|
66
|
+
rhs = type(self).rhs.T.to(torch.float8_e5m2)
|
67
|
+
|
68
|
+
assert torch.allclose(
|
69
|
+
matmul(lhs, rhs),
|
70
|
+
torch.matmul(lhs.to(torch.float16), rhs.to(torch.float16)),
|
71
|
+
atol=0.125,
|
72
|
+
)
|
ninetoothed-0.1.0/PKG-INFO
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.3
|
2
|
-
Name: ninetoothed
|
3
|
-
Version: 0.1.0
|
4
|
-
Summary: A domain-specific language based on Triton but providing higher-level abstraction.
|
5
|
-
Project-URL: Homepage, https://github.com/InfiniTensor/ninetoothed
|
6
|
-
Project-URL: Issues, https://github.com/InfiniTensor/ninetoothed/issues
|
7
|
-
Author-email: Jiacheng Huang <huangjiacheng0709@outlook.com>
|
8
|
-
License-File: LICENSE
|
9
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
10
|
-
Classifier: Operating System :: OS Independent
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
12
|
-
Requires-Python: >=3.10
|
13
|
-
Description-Content-Type: text/markdown
|
14
|
-
|
15
|
-
# Nine-Toothed
|
16
|
-
|
17
|
-
A domain-specific language based on Triton but providing higher-level abstraction.
|
18
|
-
|
19
|
-
**Read this in other languages: [English](README.md), [简体中文](docs/README.zh.md).**
|
ninetoothed-0.1.0/README.md
DELETED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|