ninetoothed-0.15.0.tar.gz → ninetoothed-0.15.1.tar.gz
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/PKG-INFO +1 -1
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/pyproject.toml +1 -1
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/aot.py +2 -2
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/generation.py +1 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/tensor.py +1 -1
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/visualization.py +10 -4
- ninetoothed-0.15.1/tests/test_aot.py +153 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_conv2d.py +16 -2
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_matmul.py +13 -6
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.gitattributes +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/pull_request_template.md +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/workflows/publish-to-pypi.yml +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/workflows/pytest.yml +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/workflows/ruff.yml +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/workflows/sphinx.yml +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.gitignore +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/LICENSE +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/README.md +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/Makefile +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/README.zh.md +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/make.bat +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/requirements.txt +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/_static/matmul-tiling.png +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/_static/ninetoothed-logo.png +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/_static/vecadd-tiling.png +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/code_generation.rst +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/conf.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/index.rst +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/installation.rst +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/python_api.rst +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/symbol.rst +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/tensor.rst +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/visualization.rst +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/requirements.txt +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/__init__.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/cudaifier.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/dtype.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/jit.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/language.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/make.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/naming.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/symbol.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/torchifier.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/__init__.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/skippers.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_add.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_addmm.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_attention.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_max_pool2d.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_naming.py +0 -0
- {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_softmax.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ninetoothed
-Version: 0.15.0
+Version: 0.15.1
 Summary: A domain-specific language based on Triton but providing higher-level abstraction.
 Project-URL: Homepage, https://github.com/InfiniTensor/ninetoothed
 Project-URL: Issues, https://github.com/InfiniTensor/ninetoothed/issues
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "ninetoothed"
-version = "0.15.0"
+version = "0.15.1"
 authors = [{ name = "Jiacheng Huang", email = "huangjiacheng0709@outlook.com" }]
 description = "A domain-specific language based on Triton but providing higher-level abstraction."
 readme = "README.md"
@@ -4,7 +4,7 @@ import subprocess
 import tempfile
 import uuid
 
-from ninetoothed.dtype import int64
+from ninetoothed.dtype import int64
 from ninetoothed.generation import CACHE_DIR, CodeGenerator
 from ninetoothed.tensor import Tensor
 
@@ -55,7 +55,7 @@ def _aot(func, caller, kernel_name, num_warps, num_stages):
 
             param_types.append(f"*{dtype}")
         elif Tensor.size_pattern().fullmatch(param):
-            param_types.append(
+            param_types.append(int64)
         elif Tensor.stride_pattern().fullmatch(param):
             param_types.append(int64)
 
@@ -82,6 +82,7 @@ class CodeGenerator(ast.NodeTransformer):
         dependencies = _find_dependencies(func)
         source = "\n\n".join((unparsed, dependencies)).strip()
         source = source.replace(func.__name__, kernel_name)
+        source += "\n"
 
         if prettify:
             for original, simplified in name_collector.simplified_names.items():
@@ -118,10 +118,16 @@ def _visualize_unit_square(ax, x, y, color):
 
 
 def _visualize_rect(ax, width, height, x, y, color):
-
-
-
-
+    ax.add_patch(
+        plt.Rectangle(
+            (x, y),
+            width,
+            height,
+            edgecolor="k",
+            facecolor=color,
+            linewidth=plt.rcParams["lines.linewidth"],
+        )
+    )
 
 
 def _verts_of_rect(width, height, x, y):
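For reference, the new `_visualize_rect` body replaces the removed implementation (elided in this view) with a single `plt.Rectangle` patch. Below is a minimal, self-contained sketch of that matplotlib pattern; the figure setup and the sample position, size, and color are illustrative assumptions, not ninetoothed code.

import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# Draw a 4x2 rectangle anchored at (1, 1), outlined in black, mirroring
# the keyword arguments used by _visualize_rect above.
ax.add_patch(
    plt.Rectangle(
        (1, 1),
        4,
        2,
        edgecolor="k",
        facecolor="tab:blue",
        linewidth=plt.rcParams["lines.linewidth"],
    )
)

ax.set_xlim(0, 6)
ax.set_ylim(0, 4)
plt.show()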
@@ -0,0 +1,153 @@
+import ctypes
+import functools
+import subprocess
+
+import torch
+import torch.nn.functional as F
+
+import ninetoothed
+import ninetoothed.generation
+import tests.test_conv2d as conv2d
+import tests.test_matmul as matmul
+from ninetoothed import Tensor
+from tests.skippers import skip_if_cuda_not_available
+
+
+@skip_if_cuda_not_available
+class TestCUDA:
+    @classmethod
+    def setup_class(cls):
+        torch.manual_seed(0)
+
+    def test_matmul(self):
+        arrangement = functools.partial(
+            matmul.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
+        )
+        application = matmul.application
+        tensors = tuple(Tensor(2, dtype=ninetoothed.float16) for _ in range(3))
+        caller = "cuda"
+        kernel_name = "matmul"
+        output_dir = ninetoothed.generation.CACHE_DIR
+
+        launch_func = _generate_launch_func(
+            arrangement,
+            application,
+            tensors,
+            caller=caller,
+            kernel_name=kernel_name,
+            output_dir=output_dir,
+        )
+
+        shape = (512, 512)
+        dtype = torch.float16
+        device = caller
+
+        lhs = torch.randn(shape, dtype=dtype, device=device)
+        rhs = torch.randn(shape, dtype=dtype, device=device)
+        output = torch.empty((lhs.shape[0], rhs.shape[1]), dtype=dtype, device=device)
+
+        _run_launch_func(launch_func, lhs, rhs, output)
+
+        assert torch.allclose(output, torch.matmul(lhs, rhs))
+
+    def test_conv2d(self):
+        arrangement = functools.partial(
+            conv2d.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
+        )
+        application = matmul.application
+        tensors = tuple(Tensor(4, dtype=ninetoothed.float16) for _ in range(3))
+        caller = "cuda"
+        kernel_name = "conv2d"
+        output_dir = ninetoothed.generation.CACHE_DIR
+
+        launch_func = _generate_launch_func(
+            arrangement,
+            application,
+            tensors,
+            caller=caller,
+            kernel_name=kernel_name,
+            output_dir=output_dir,
+        )
+
+        n, c, h, w = 4, 64, 16, 16
+        k, _, r, s = 512, c, 3, 3
+        p = h - r + 1
+        q = w - s + 1
+        dtype = torch.float16
+        device = caller
+
+        input = torch.randn(n, c, h, w, dtype=dtype, device=device)
+        filter = torch.randn(k, c, r, s, dtype=dtype, device=device)
+        output = torch.empty(n, k, p, q, dtype=dtype, device=device)
+
+        _run_launch_func(launch_func, input, filter, output)
+
+        assert torch.allclose(output, F.conv2d(input, filter), atol=0.001, rtol=0.001)
+
+
+class _ArgumentTensor(ctypes.Structure):
+    _fields_ = [
+        ("data", ctypes.c_void_p),
+        ("shape", ctypes.POINTER(ctypes.c_uint64)),
+        ("strides", ctypes.POINTER(ctypes.c_int64)),
+    ]
+
+    @staticmethod
+    def from_torch_tensor(tensor):
+        data = ctypes.c_void_p(tensor.data_ptr())
+        shape = (ctypes.c_uint64 * len(tensor.shape))(*tensor.shape)
+        strides = (ctypes.c_int64 * len(tensor.stride()))(*tensor.stride())
+
+        return _ArgumentTensor(data, shape, strides)
+
+
+def _run_launch_func(launch_func, *tensors):
+    stream = torch.cuda.Stream()
+
+    arg_tensors = tuple(_ArgumentTensor.from_torch_tensor(tensor) for tensor in tensors)
+
+    with torch.cuda.stream(stream):
+        launch_func(ctypes.c_void_p(stream.cuda_stream), *arg_tensors)
+
+    stream.synchronize()
+
+
+def _generate_launch_func(
+    arrangement, application, tensors, caller, kernel_name, output_dir
+):
+    ninetoothed.make(
+        arrangement,
+        application,
+        tensors,
+        caller=caller,
+        kernel_name=kernel_name,
+        output_dir=output_dir,
+    )
+
+    _compile_library(kernel_name, output_dir)
+    library = _load_library(kernel_name, output_dir)
+    launch_func_name = f"launch_{kernel_name}"
+    launch_func = getattr(library, launch_func_name)
+    launch_func.argtypes = (ctypes.c_void_p,) + tuple(_ArgumentTensor for _ in tensors)
+    launch_func.restype = ctypes.c_int
+
+    return launch_func
+
+
+def _compile_library(kernel_name, output_dir):
+    command = [
+        "nvcc",
+        "-shared",
+        "-Xcompiler",
+        "-fPIC",
+        "-lcuda",
+        "-o",
+        output_dir / f"{kernel_name}.so",
+        output_dir / f"{kernel_name}.c",
+    ]
+
+    subprocess.run(command, check=True)
+
+
+def _load_library(kernel_name, kernel_dir):
+    return ctypes.CDLL(kernel_dir / f"{kernel_name}.so")
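The `_ArgumentTensor` structure above defines the ABI through which the generated launch function receives each tensor: a raw data pointer plus pointers to its shape and strides. Here is a small sketch of the same ctypes packing, runnable on CPU without the compiled kernel; the example tensor and the read-back loop are illustrative assumptions.

import ctypes

import torch

class ArgumentTensor(ctypes.Structure):
    # Same layout as _ArgumentTensor in the test above.
    _fields_ = [
        ("data", ctypes.c_void_p),
        ("shape", ctypes.POINTER(ctypes.c_uint64)),
        ("strides", ctypes.POINTER(ctypes.c_int64)),
    ]

t = torch.empty(2, 3)

# The arrays must stay referenced while the struct is in use, since the
# struct only stores pointers to them.
shape = (ctypes.c_uint64 * t.dim())(*t.shape)
strides = (ctypes.c_int64 * t.dim())(*t.stride())
arg = ArgumentTensor(ctypes.c_void_p(t.data_ptr()), shape, strides)

# For a contiguous 2x3 tensor: shape (2, 3), strides (3, 1).
print(tuple(arg.shape[i] for i in range(t.dim())))
print(tuple(arg.strides[i] for i in range(t.dim())))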
@@ -1,3 +1,5 @@
+import functools
+
 import torch
 import torch.nn.functional as F
 
@@ -7,7 +9,14 @@ from ninetoothed import Tensor
 from tests.skippers import skip_if_cuda_not_available
 
 
-def arrangement(input, filter, output):
+def arrangement(
+    input,
+    filter,
+    output,
+    BLOCK_SIZE_M=matmul.BLOCK_SIZE_M,
+    BLOCK_SIZE_N=matmul.BLOCK_SIZE_N,
+    BLOCK_SIZE_K=matmul.BLOCK_SIZE_K,
+):
     input_tiled = input.tile((1, *filter.shape[1:]), strides=(-1, -1, 1, 1))
     input_squeezed = input_tiled.squeeze(1)
     input_squeezed.dtype = input_squeezed.dtype.squeeze(0)
@@ -19,7 +28,12 @@ def arrangement(input, filter, output):
 
     output_flattened = output.permute((0, 2, 3, 1)).flatten(end_dim=3)
 
-    return
+    return functools.partial(
+        matmul.arrangement,
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+    )(input_flattened, filter_permuted, output_flattened)
 
 
 def conv2d(input, filter):
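This change makes the conv2d arrangement an explicit delegation to the matmul arrangement (an implicit-GEMM formulation): the flattened input patches, permuted filter, and flattened output are handed to `matmul.arrangement` with the chosen block sizes. As a cross-check of the underlying algebra, here is a pure-PyTorch sketch of the same im2col idea, independent of ninetoothed; the sizes are illustrative.

import torch
import torch.nn.functional as F

n, c, h, w = 2, 4, 8, 8
k, r, s = 16, 3, 3
p, q = h - r + 1, w - s + 1

input = torch.randn(n, c, h, w)
filter = torch.randn(k, c, r, s)

# im2col: every (c, r, s) input patch becomes a row, and the filter bank
# becomes a (c*r*s, k) matrix, so the convolution reduces to one matmul.
patches = F.unfold(input, (r, s)).transpose(1, 2)  # (n, p*q, c*r*s)
weights = filter.reshape(k, -1).t()                # (c*r*s, k)
output = (patches @ weights).transpose(1, 2).reshape(n, k, p, q)

assert torch.allclose(output, F.conv2d(input, filter), atol=0.001, rtol=0.001)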
@@ -5,12 +5,19 @@ import ninetoothed.language as ntl
 from ninetoothed import Symbol, Tensor
 from tests.skippers import skip_if_cuda_not_available, skip_if_float8_e5m2_not_supported
 
-
-
-
-
-
-
+BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
+BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
+BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", meta=True)
+
+
+def arrangement(
+    lhs,
+    rhs,
+    output,
+    BLOCK_SIZE_M=BLOCK_SIZE_M,
+    BLOCK_SIZE_N=BLOCK_SIZE_N,
+    BLOCK_SIZE_K=BLOCK_SIZE_K,
+):
     output_tiled = output.tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
 
     lhs_tiled = (
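Exposing BLOCK_SIZE_M/N/K as keyword parameters (with the meta `Symbol`s as defaults) lets callers pin concrete block sizes ahead of time, which is exactly what the new tests/test_aot.py does. A brief sketch of that usage, mirroring the calls in the test above; the value 64 is an assumed choice.

import functools

import ninetoothed
import ninetoothed.generation
import tests.test_matmul as matmul
from ninetoothed import Tensor

# Bind concrete block sizes for ahead-of-time generation.
arrangement = functools.partial(
    matmul.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
)
tensors = tuple(Tensor(2, dtype=ninetoothed.float16) for _ in range(3))

# Emit a C-callable "matmul" kernel into the cache directory.
ninetoothed.make(
    arrangement,
    matmul.application,
    tensors,
    caller="cuda",
    kernel_name="matmul",
    output_dir=ninetoothed.generation.CACHE_DIR,
)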