ninetoothed 0.15.0__tar.gz → 0.15.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/PKG-INFO +1 -1
  2. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/pyproject.toml +1 -1
  3. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/aot.py +2 -2
  4. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/generation.py +1 -0
  5. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/tensor.py +1 -1
  6. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/visualization.py +10 -4
  7. ninetoothed-0.15.1/tests/test_aot.py +153 -0
  8. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_conv2d.py +16 -2
  9. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_matmul.py +13 -6
  10. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.gitattributes +0 -0
  11. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
  12. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
  13. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/pull_request_template.md +0 -0
  14. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/workflows/publish-to-pypi.yml +0 -0
  15. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/workflows/pytest.yml +0 -0
  16. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/workflows/ruff.yml +0 -0
  17. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.github/workflows/sphinx.yml +0 -0
  18. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/.gitignore +0 -0
  19. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/LICENSE +0 -0
  20. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/README.md +0 -0
  21. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/Makefile +0 -0
  22. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/README.zh.md +0 -0
  23. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/make.bat +0 -0
  24. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/requirements.txt +0 -0
  25. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/_static/matmul-tiling.png +0 -0
  26. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/_static/ninetoothed-logo.png +0 -0
  27. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/_static/vecadd-tiling.png +0 -0
  28. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/code_generation.rst +0 -0
  29. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/conf.py +0 -0
  30. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/index.rst +0 -0
  31. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/installation.rst +0 -0
  32. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/python_api.rst +0 -0
  33. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/symbol.rst +0 -0
  34. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/tensor.rst +0 -0
  35. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/docs/source/visualization.rst +0 -0
  36. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/requirements.txt +0 -0
  37. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/__init__.py +0 -0
  38. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/cudaifier.py +0 -0
  39. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/dtype.py +0 -0
  40. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/jit.py +0 -0
  41. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/language.py +0 -0
  42. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/make.py +0 -0
  43. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/naming.py +0 -0
  44. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/symbol.py +0 -0
  45. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/torchifier.py +0 -0
  46. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/__init__.py +0 -0
  47. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/skippers.py +0 -0
  48. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_add.py +0 -0
  49. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_addmm.py +0 -0
  50. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_attention.py +0 -0
  51. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_max_pool2d.py +0 -0
  52. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_naming.py +0 -0
  53. {ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_softmax.py +0 -0
{ninetoothed-0.15.0 → ninetoothed-0.15.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ninetoothed
-Version: 0.15.0
+Version: 0.15.1
 Summary: A domain-specific language based on Triton but providing higher-level abstraction.
 Project-URL: Homepage, https://github.com/InfiniTensor/ninetoothed
 Project-URL: Issues, https://github.com/InfiniTensor/ninetoothed/issues
{ninetoothed-0.15.0 → ninetoothed-0.15.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "ninetoothed"
-version = "0.15.0"
+version = "0.15.1"
 authors = [{ name = "Jiacheng Huang", email = "huangjiacheng0709@outlook.com" }]
 description = "A domain-specific language based on Triton but providing higher-level abstraction."
 readme = "README.md"
{ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/aot.py
@@ -4,7 +4,7 @@ import subprocess
 import tempfile
 import uuid
 
-from ninetoothed.dtype import int64, uint64
+from ninetoothed.dtype import int64
 from ninetoothed.generation import CACHE_DIR, CodeGenerator
 from ninetoothed.tensor import Tensor
 
@@ -55,7 +55,7 @@ def _aot(func, caller, kernel_name, num_warps, num_stages):
 
            param_types.append(f"*{dtype}")
        elif Tensor.size_pattern().fullmatch(param):
-            param_types.append(uint64)
+            param_types.append(int64)
        elif Tensor.stride_pattern().fullmatch(param):
            param_types.append(int64)
 
{ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/generation.py
@@ -82,6 +82,7 @@ class CodeGenerator(ast.NodeTransformer):
         dependencies = _find_dependencies(func)
         source = "\n\n".join((unparsed, dependencies)).strip()
         source = source.replace(func.__name__, kernel_name)
+        source += "\n"
 
         if prettify:
             for original, simplified in name_collector.simplified_names.items():
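Note: the appended "\n" presumably just guarantees that the emitted kernel source ends with a newline, the usual text-file convention, so tools reading the generated sources never see an unterminated last line. A trivial illustration:

    source = "def kernel(): pass"
    source += "\n"
    assert source.endswith("\n")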
{ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/tensor.py
@@ -146,7 +146,7 @@ class Tensor:
             )
             outer_shape.append(new_size)
 
-            new_stride = self_stride * stride // spacing
+            new_stride = self_stride * stride
             outer_strides.append(new_stride)
 
             inner_shape.append(tile_size)
{ninetoothed-0.15.0 → ninetoothed-0.15.1}/src/ninetoothed/visualization.py
@@ -118,10 +118,16 @@ def _visualize_unit_square(ax, x, y, color):
 
 
 def _visualize_rect(ax, width, height, x, y, color):
-    pos_x, pos_y = zip(*_verts_of_rect(width, height, x, y))
-
-    ax.fill(pos_x, pos_y, color)
-    ax.plot(pos_x + (pos_x[0],), pos_y + (pos_y[0],), "k")
+    ax.add_patch(
+        plt.Rectangle(
+            (x, y),
+            width,
+            height,
+            edgecolor="k",
+            facecolor=color,
+            linewidth=plt.rcParams["lines.linewidth"],
+        )
+    )
 
 
 def _verts_of_rect(width, height, x, y):
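Note: the rewritten _visualize_rect draws each rectangle as one matplotlib Rectangle patch (fill and black outline in a single artist) instead of a separate ax.fill/ax.plot pair. A standalone sketch of the same pattern, outside the package (corner, width, height, and colors here are arbitrary demo values):

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.add_patch(
        plt.Rectangle(
            (1, 1),  # lower-left corner (x, y)
            3,  # width
            2,  # height
            edgecolor="k",
            facecolor="tab:blue",
            linewidth=plt.rcParams["lines.linewidth"],
        )
    )
    ax.set_xlim(0, 5)  # add_patch does not autoscale, so set limits explicitly
    ax.set_ylim(0, 4)
    plt.show()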
ninetoothed-0.15.1/tests/test_aot.py
@@ -0,0 +1,153 @@
+import ctypes
+import functools
+import subprocess
+
+import torch
+import torch.nn.functional as F
+
+import ninetoothed
+import ninetoothed.generation
+import tests.test_conv2d as conv2d
+import tests.test_matmul as matmul
+from ninetoothed import Tensor
+from tests.skippers import skip_if_cuda_not_available
+
+
+@skip_if_cuda_not_available
+class TestCUDA:
+    @classmethod
+    def setup_class(cls):
+        torch.manual_seed(0)
+
+    def test_matmul(self):
+        arrangement = functools.partial(
+            matmul.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
+        )
+        application = matmul.application
+        tensors = tuple(Tensor(2, dtype=ninetoothed.float16) for _ in range(3))
+        caller = "cuda"
+        kernel_name = "matmul"
+        output_dir = ninetoothed.generation.CACHE_DIR
+
+        launch_func = _generate_launch_func(
+            arrangement,
+            application,
+            tensors,
+            caller=caller,
+            kernel_name=kernel_name,
+            output_dir=output_dir,
+        )
+
+        shape = (512, 512)
+        dtype = torch.float16
+        device = caller
+
+        lhs = torch.randn(shape, dtype=dtype, device=device)
+        rhs = torch.randn(shape, dtype=dtype, device=device)
+        output = torch.empty((lhs.shape[0], rhs.shape[1]), dtype=dtype, device=device)
+
+        _run_launch_func(launch_func, lhs, rhs, output)
+
+        assert torch.allclose(output, torch.matmul(lhs, rhs))
+
+    def test_conv2d(self):
+        arrangement = functools.partial(
+            conv2d.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
+        )
+        application = matmul.application
+        tensors = tuple(Tensor(4, dtype=ninetoothed.float16) for _ in range(3))
+        caller = "cuda"
+        kernel_name = "conv2d"
+        output_dir = ninetoothed.generation.CACHE_DIR
+
+        launch_func = _generate_launch_func(
+            arrangement,
+            application,
+            tensors,
+            caller=caller,
+            kernel_name=kernel_name,
+            output_dir=output_dir,
+        )
+
+        n, c, h, w = 4, 64, 16, 16
+        k, _, r, s = 512, c, 3, 3
+        p = h - r + 1
+        q = w - s + 1
+        dtype = torch.float16
+        device = caller
+
+        input = torch.randn(n, c, h, w, dtype=dtype, device=device)
+        filter = torch.randn(k, c, r, s, dtype=dtype, device=device)
+        output = torch.empty(n, k, p, q, dtype=dtype, device=device)
+
+        _run_launch_func(launch_func, input, filter, output)
+
+        assert torch.allclose(output, F.conv2d(input, filter), atol=0.001, rtol=0.001)
+
+
+class _ArgumentTensor(ctypes.Structure):
+    _fields_ = [
+        ("data", ctypes.c_void_p),
+        ("shape", ctypes.POINTER(ctypes.c_uint64)),
+        ("strides", ctypes.POINTER(ctypes.c_int64)),
+    ]
+
+    @staticmethod
+    def from_torch_tensor(tensor):
+        data = ctypes.c_void_p(tensor.data_ptr())
+        shape = (ctypes.c_uint64 * len(tensor.shape))(*tensor.shape)
+        strides = (ctypes.c_int64 * len(tensor.stride()))(*tensor.stride())
+
+        return _ArgumentTensor(data, shape, strides)
+
+
+def _run_launch_func(launch_func, *tensors):
+    stream = torch.cuda.Stream()
+
+    arg_tensors = tuple(_ArgumentTensor.from_torch_tensor(tensor) for tensor in tensors)
+
+    with torch.cuda.stream(stream):
+        launch_func(ctypes.c_void_p(stream.cuda_stream), *arg_tensors)
+
+    stream.synchronize()
+
+
+def _generate_launch_func(
+    arrangement, application, tensors, caller, kernel_name, output_dir
+):
+    ninetoothed.make(
+        arrangement,
+        application,
+        tensors,
+        caller=caller,
+        kernel_name=kernel_name,
+        output_dir=output_dir,
+    )
+
+    _compile_library(kernel_name, output_dir)
+    library = _load_library(kernel_name, output_dir)
+    launch_func_name = f"launch_{kernel_name}"
+    launch_func = getattr(library, launch_func_name)
+    launch_func.argtypes = (ctypes.c_void_p,) + tuple(_ArgumentTensor for _ in tensors)
+    launch_func.restype = ctypes.c_int
+
+    return launch_func
+
+
+def _compile_library(kernel_name, output_dir):
+    command = [
+        "nvcc",
+        "-shared",
+        "-Xcompiler",
+        "-fPIC",
+        "-lcuda",
+        "-o",
+        output_dir / f"{kernel_name}.so",
+        output_dir / f"{kernel_name}.c",
+    ]
+
+    subprocess.run(command, check=True)
+
+
+def _load_library(kernel_name, kernel_dir):
+    return ctypes.CDLL(kernel_dir / f"{kernel_name}.so")
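Note: _ArgumentTensor mirrors a C struct of the form {void *data; uint64_t *shape; int64_t *strides;}, which the generated launcher is apparently expected to take by value (ctypes passes Structure arguments positionally by value). A standalone sketch of what from_torch_tensor builds, runnable on CPU (ArgumentTensor is a local copy of the test's private class, not package API):

    import ctypes

    import torch

    class ArgumentTensor(ctypes.Structure):
        _fields_ = [
            ("data", ctypes.c_void_p),
            ("shape", ctypes.POINTER(ctypes.c_uint64)),
            ("strides", ctypes.POINTER(ctypes.c_int64)),
        ]

    t = torch.empty(2, 3)
    arg = ArgumentTensor(
        ctypes.c_void_p(t.data_ptr()),
        (ctypes.c_uint64 * t.dim())(*t.shape),
        (ctypes.c_int64 * t.dim())(*t.stride()),
    )
    # A contiguous 2x3 tensor has shape (2, 3) and strides (3, 1).
    assert [arg.shape[i] for i in range(t.dim())] == [2, 3]
    assert [arg.strides[i] for i in range(t.dim())] == [3, 1]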
{ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_conv2d.py
@@ -1,3 +1,5 @@
+import functools
+
 import torch
 import torch.nn.functional as F
 
@@ -7,7 +9,14 @@ from ninetoothed import Tensor
 from tests.skippers import skip_if_cuda_not_available
 
 
-def arrangement(input, filter, output):
+def arrangement(
+    input,
+    filter,
+    output,
+    BLOCK_SIZE_M=matmul.BLOCK_SIZE_M,
+    BLOCK_SIZE_N=matmul.BLOCK_SIZE_N,
+    BLOCK_SIZE_K=matmul.BLOCK_SIZE_K,
+):
     input_tiled = input.tile((1, *filter.shape[1:]), strides=(-1, -1, 1, 1))
     input_squeezed = input_tiled.squeeze(1)
     input_squeezed.dtype = input_squeezed.dtype.squeeze(0)
@@ -19,7 +28,12 @@ def arrangement(input, filter, output):
 
     output_flattened = output.permute((0, 2, 3, 1)).flatten(end_dim=3)
 
-    return matmul.arrangement(input_flattened, filter_permuted, output_flattened)
+    return functools.partial(
+        matmul.arrangement,
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+    )(input_flattened, filter_permuted, output_flattened)
 
 
 def conv2d(input, filter):
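Note: the functools.partial(...)(...) in the new return statement is applied immediately, so it amounts to plain keyword forwarding into matmul.arrangement. A minimal self-contained illustration of that equivalence (f and scale are demo names, not package code):

    import functools

    def f(x, scale=1):
        return x * scale

    assert functools.partial(f, scale=3)(2) == f(2, scale=3) == 6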
{ninetoothed-0.15.0 → ninetoothed-0.15.1}/tests/test_matmul.py
@@ -5,12 +5,19 @@ import ninetoothed.language as ntl
 from ninetoothed import Symbol, Tensor
 from tests.skippers import skip_if_cuda_not_available, skip_if_float8_e5m2_not_supported
 
-
-def arrangement(lhs, rhs, output):
-    BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
-    BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
-    BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", meta=True)
-
+BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
+BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
+BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", meta=True)
+
+
+def arrangement(
+    lhs,
+    rhs,
+    output,
+    BLOCK_SIZE_M=BLOCK_SIZE_M,
+    BLOCK_SIZE_N=BLOCK_SIZE_N,
+    BLOCK_SIZE_K=BLOCK_SIZE_K,
+):
     output_tiled = output.tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
 
     lhs_tiled = (
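Note: with the block sizes promoted to module-level meta Symbols and exposed as keyword parameters, callers can bind concrete values ahead of time, which is how the new tests/test_aot.py drives AOT compilation. A sketch, assuming the tests package is importable:

    import functools

    import tests.test_matmul as matmul

    # Fix the tile sizes up front; without overrides, the module-level
    # meta Symbols remain the defaults for JIT autotuning.
    arrangement_64 = functools.partial(
        matmul.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
    )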