PyPI - ninetoothed - Versions diffs - 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

ninetoothed 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

ninetoothed/jit.py +206 -43
ninetoothed/language.py +4 -1
ninetoothed/symbol.py +63 -8
ninetoothed/tensor.py +105 -39
ninetoothed/torchifier.py +15 -11
{ninetoothed-0.1.1.dist-info → ninetoothed-0.3.0.dist-info}/METADATA +9 -5
ninetoothed-0.3.0.dist-info/RECORD +10 -0
ninetoothed-0.1.1.dist-info/RECORD +0 -10
{ninetoothed-0.1.1.dist-info → ninetoothed-0.3.0.dist-info}/WHEEL +0 -0
{ninetoothed-0.1.1.dist-info → ninetoothed-0.3.0.dist-info}/licenses/LICENSE +0 -0

ninetoothed/jit.py CHANGED Viewed

@@ -5,7 +5,8 @@ import inspect
 import itertools
 import math
 import tempfile
-import textwrap
+import triton
 from ninetoothed.language import attribute, call
 from ninetoothed.symbol import Symbol
@@ -33,8 +34,7 @@ class JIT:
         ):
             return type(self).handles[source_file][source_line]
-        source = textwrap.dedent(inspect.getsource(self.func))
-        tree = ast.parse(source)
+        tree = self._get_tree()
         CodeGenerator(inspect.get_annotations(self.func)).visit(tree)
         Tritonizer().visit(tree)
@@ -56,15 +56,7 @@ class JIT:
         namespace = {}
         exec(code, namespace)
-        class Handle:
-            def __init__(self, kernel, launch):
-                self._kernel = kernel
-                self._launch = launch
-            def __call__(self, *args, **kwargs):
-                return self._launch(*args, **kwargs)
-        handle = Handle(
+        handle = _Handle(
             namespace[self.func.__name__],
             namespace[f"launch_{self.func.__name__}"],
         )
@@ -73,6 +65,15 @@ class JIT:
         return handle
+    def _get_tree(self):
+        module = ast.parse(inspect.getsource(inspect.getmodule(self.func)))
+        _AliasRestorer().visit(module)
+        finder = _FunctionDefFinder(self.func.__name__)
+        finder.visit(module)
+        return ast.Module(body=[finder.result], type_ignores=[])
 class CodeGenerator(ast.NodeTransformer):
     def __init__(self, context):
@@ -100,6 +101,29 @@ class CodeGenerator(ast.NodeTransformer):
         self.generic_visit(node)
+        for arg in self._args:
+            if not isinstance(arg, Tensor):
+                continue
+            offsets = arg.offsets()
+            initializations = {
+                type(self)._name_for_offsets(arg, dim): offs
+                for dim, offs in enumerate(offsets)
+            } | {
+                type(self)._name_for_pointers(arg): arg.original.pointer_string()
+                + sum(
+                    type(self)._name_for_offsets(arg, dim)[
+                        type(self)._generate_slices(arg, dim)
+                    ]
+                    * stride
+                    for dim, stride in enumerate(arg.original.strides)
+                )
+            }
+            for target, value in reversed(initializations.items()):
+                node.body.insert(0, ast.Assign(targets=[target.node], value=value.node))
         return node
     def visit_arguments(self, node):
@@ -136,14 +160,12 @@ class CodeGenerator(ast.NodeTransformer):
             value = self._context[node.value.id]
             if isinstance(value, Tensor):
-                if isinstance(node.slice, ast.Tuple):
-                    indices = value.indices() + tuple(node.slice.elts)
-                else:
-                    indices = value.indices() + (node.slice,)
-                offsets = value.offsets(indices)
-                pointers = value.pointers(offsets)
-                return call("load", pointers).node
+                return type(self)._generate_load(
+                    value,
+                    intermediate_indices=node.slice.elts
+                    if isinstance(node.slice, ast.Tuple)
+                    else (node.slice,),
+                )
         self.generic_visit(node)
@@ -166,7 +188,7 @@ class CodeGenerator(ast.NodeTransformer):
         self.generic_visit(node)
         if node.id in self._context and isinstance(node.ctx, ast.Load):
-            return call("load", self._context[node.id].pointers().node).node
+            return type(self)._generate_load(self._context[node.id])
         return node
@@ -178,11 +200,7 @@ class CodeGenerator(ast.NodeTransformer):
                 self.generic_visit(node)
                 return ast.Expr(
-                    call(
-                        "store",
-                        self._context[target.id].pointers().node,
-                        node.value,
-                    ).node
+                    type(self)._generate_store(self._context[target.id], node.value)
                 )
             elif (
                 isinstance(target, ast.Subscript)
@@ -195,20 +213,14 @@ class CodeGenerator(ast.NodeTransformer):
                 if isinstance(value, Tensor):
                     self.generic_visit(node)
-                    indices = value.indices() + tuple(
-                        target.slice.elts
-                        if isinstance(target.slice, ast.Tuple)
-                        else target.slice
-                    )
-                    offsets = value.offsets(indices)
-                    pointers = value.pointers(offsets)
                     return ast.Expr(
-                        call(
-                            "store",
-                            pointers.node,
+                        type(self)._generate_store(
+                            value,
                             node.value,
-                        ).node
+                            intermediate_indices=target.slice.elts
+                            if isinstance(target.slice, ast.Tuple)
+                            else (target.slice,),
+                        )
                     )
         self.generic_visit(node)
@@ -216,6 +228,13 @@ class CodeGenerator(ast.NodeTransformer):
         return node
     def _generate_autotune(self, params, meta):
+        device = triton.runtime.driver.active.get_current_device()
+        properties = triton.runtime.driver.active.utils.get_device_properties(device)
+        max_shared_mem = properties["max_shared_mem"]
+        num_warps = 8
+        num_stages = max_shared_mem // 2**15
         configs = [
             ast.Call(
                 func=ast.Attribute(
@@ -229,7 +248,10 @@ class CodeGenerator(ast.NodeTransformer):
                         values=[ast.Constant(value=value) for value in values],
                     )
                 ],
-                keywords=[],
+                keywords=[
+                    ast.keyword(arg="num_warps", value=ast.Constant(value=num_warps)),
+                    ast.keyword(arg="num_stages", value=ast.Constant(value=num_stages)),
+                ],
             )
             for values in itertools.product(self._POWER_OF_TWOS, repeat=len(meta))
             if self._MIN_PRODUCT <= math.prod(values) <= self._MAX_PRODUCT
@@ -256,7 +278,7 @@ class CodeGenerator(ast.NodeTransformer):
                         elts=[
                             ast.Constant(value=param)
                             for param in params
-                            if not Tensor.is_pointer(param)
+                            if not Tensor.pointer_pattern().fullmatch(param)
                         ],
                         ctx=ast.Load(),
                     ),
@@ -269,7 +291,7 @@ class CodeGenerator(ast.NodeTransformer):
             name=f"launch_{self._func_def.name}",
             args=ast.arguments(
                 posonlyargs=[],
-                args=[ast.arg(arg.name) for arg in self._args],
+                args=[ast.arg(arg.original.name) for arg in self._args],
                 kwonlyargs=[],
                 defaults=[],
             ),
@@ -316,6 +338,77 @@ class CodeGenerator(ast.NodeTransformer):
         return ast.parse(f"lambda meta: ({num_elements},)", mode="eval").body
+    @staticmethod
+    def _generate_load(tensor, intermediate_indices=()):
+        pointers, mask = CodeGenerator._generate_pointers_and_mask(
+            tensor, intermediate_indices
+        )
+        other = CodeGenerator._generate_other(tensor)
+        return call("load", pointers, mask=mask, other=other).node
+    @staticmethod
+    def _generate_store(tensor, value, intermediate_indices=()):
+        pointers, mask = CodeGenerator._generate_pointers_and_mask(
+            tensor, intermediate_indices
+        )
+        return call("store", pointers, value, mask=mask).node
+    @staticmethod
+    def _generate_pointers_and_mask(tensor, intermediate_indices):
+        intermediate_offsets = CodeGenerator._generate_intermediate_offsets(
+            tensor, intermediate_indices
+        )
+        offsets = [
+            CodeGenerator._name_for_offsets(tensor, dim) + intermediate_offsets[dim]
+            for dim in range(tensor.original.ndim)
+        ]
+        pointers = CodeGenerator._name_for_pointers(tensor) + sum(
+            map(lambda x, y: x * y, intermediate_offsets, tensor.original.strides)
+        )
+        mask = functools.reduce(
+            lambda x, y: x & y,
+            (
+                offs[CodeGenerator._generate_slices(tensor, dim)] < size
+                for dim, (offs, size) in enumerate(zip(offsets, tensor.original.shape))
+            ),
+        )
+        return pointers, mask
+    @staticmethod
+    def _generate_other(tensor):
+        other = tensor.original.other
+        if isinstance(other, float) and not math.isfinite(other):
+            return f"float('{other}')"
+        return other
+    @staticmethod
+    def _generate_slices(tensor, dim):
+        return tuple(slice(None) if i == dim else None for i in range(tensor.ndim))
+    @staticmethod
+    def _generate_intermediate_offsets(tensor, intermediate_indices):
+        return tuple(
+            offs
+            for offs in tensor.offsets(
+                [0 for _ in range(tensor.ndim)]
+                + list(intermediate_indices)
+                + [0 for _ in range(tensor.inmost().ndim)]
+            )
+        )
+    @staticmethod
+    def _name_for_pointers(tensor):
+        return Symbol(f"{tensor.original.name}_pointers")
+    @staticmethod
+    def _name_for_offsets(tensor, dim):
+        return Symbol(f"{tensor.original.name}_offsets_{dim}")
 class Tritonizer(ast.NodeTransformer):
     def visit_Module(self, node):
@@ -329,8 +422,8 @@ class Tritonizer(ast.NodeTransformer):
     def visit_Name(self, node):
         self.generic_visit(node)
-        if node.id == "ninetoothed":
-            node.id = "triton"
+        if node.id == "ninetoothed" or "ninetoothed." in node.id:
+            node.id = node.id.replace("ninetoothed", "triton")
         return node
@@ -348,3 +441,73 @@ class Tritonizer(ast.NodeTransformer):
             )
         return node
+class _Handle:
+    def __init__(self, kernel, launch):
+        self._kernel = kernel
+        self._launch = launch
+    def __call__(self, *args, **kwargs):
+        return self._launch(*args, **kwargs)
+class _AliasRestorer(ast.NodeTransformer):
+    def __init__(self):
+        super().__init__()
+        self._aliases = {}
+        self._redefined = set()
+    def visit_Import(self, node):
+        for alias in node.names:
+            if alias.asname:
+                self._aliases[alias.asname] = alias.name
+        return node
+    def visit_ImportFrom(self, node):
+        for alias in node.names:
+            full_name = f"{node.module}.{alias.name}"
+            if alias.asname:
+                self._aliases[alias.asname] = full_name
+        return node
+    def visit_Assign(self, node):
+        for target in node.targets:
+            if isinstance(target, ast.Name):
+                self._redefined.add(target.id)
+        return self.generic_visit(node)
+    def visit_FunctionDef(self, node):
+        original_redefined = self._redefined.copy()
+        self.generic_visit(node)
+        self._redefined = original_redefined
+        return node
+    def visit_Name(self, node):
+        if node.id in self._redefined:
+            return node
+        if node.id in self._aliases:
+            return ast.Name(id=self._aliases[node.id], ctx=node.ctx)
+        return node
+class _FunctionDefFinder(ast.NodeVisitor):
+    def __init__(self, name):
+        self._name = name
+        self.result = None
+    def visit_FunctionDef(self, node):
+        if node.name == self._name:
+            self.result = node
+        self.generic_visit(node)

ninetoothed/language.py CHANGED Viewed

@@ -10,7 +10,10 @@ def call(func, *args, **kwargs):
         ast.Call(
             func=attribute(func).node,
             args=[Symbol(arg).node for arg in args],
-            keywords=[(kwarg, Symbol(kwargs[kwarg]).node) for kwarg in kwargs],
+            keywords=[
+                ast.keyword(arg=kwarg, value=Symbol(kwargs[kwarg]).node)
+                for kwarg in kwargs
+            ],
         )
     )

ninetoothed/symbol.py CHANGED Viewed

@@ -34,37 +34,80 @@ class Symbol:
             self._node.id = type(self)._create_constexpr(self._node.id)
     def __add__(self, other):
-        return type(self)(
-            ast.BinOp(left=self._node, op=ast.Add(), right=type(self)(other)._node)
-        )
+        other = type(self)(other)
+        if isinstance(self._node, ast.Constant) and self._node.value == 0:
+            return other
+        if isinstance(other._node, ast.Constant) and other._node.value == 0:
+            return self
+        return type(self)(ast.BinOp(left=self._node, op=ast.Add(), right=other._node))
     def __radd__(self, other):
         return self.__add__(other)
     def __mul__(self, other):
-        return type(self)(
-            ast.BinOp(left=self._node, op=ast.Mult(), right=type(self)(other)._node)
-        )
+        other = type(self)(other)
+        if isinstance(self._node, ast.Constant) and self._node.value == 0:
+            return type(self)(0)
+        if isinstance(other._node, ast.Constant) and other._node.value == 0:
+            return type(self)(0)
+        if isinstance(self._node, ast.Constant) and self._node.value == 1:
+            return other
+        if isinstance(other._node, ast.Constant) and other._node.value == 1:
+            return self
+        return type(self)(ast.BinOp(left=self._node, op=ast.Mult(), right=other._node))
     def __rmul__(self, other):
         return self.__mul__(other)
     def __floordiv__(self, other):
+        other = type(self)(other)
+        if isinstance(other._node, ast.Constant) and other._node.value == 1:
+            return self
         return type(self)(
-            ast.BinOp(left=self._node, op=ast.FloorDiv(), right=type(self)(other)._node)
+            ast.BinOp(left=self._node, op=ast.FloorDiv(), right=other._node)
         )
     def __mod__(self, other):
+        other = type(self)(other)
+        return type(self)(ast.BinOp(left=self._node, op=ast.Mod(), right=other._node))
+    def __lt__(self, other):
+        other = type(self)(other)
+        return type(self)(
+            ast.Compare(left=self._node, ops=[ast.Lt()], comparators=[other._node])
+        )
+    def __and__(self, other):
+        other = type(self)(other)
         return type(self)(
-            ast.BinOp(left=self._node, op=ast.Mod(), right=type(self)(other)._node)
+            ast.BinOp(left=self._node, op=ast.BitAnd(), right=other._node)
         )
+    def __rand__(self, other):
+        return self.__and__(other)
     def __getitem__(self, key):
         return type(self)(ast.Subscript(value=self._node, slice=type(self)(key)._node))
     def __repr__(self):
         return ast.unparse(self._node)
+    def find_and_replace(self, target, replacement):
+        _FindAndReplacer(target.node, replacement.node).visit(self._node)
     def names(self):
         class NameCollector(ast.NodeVisitor):
             def __init__(self):
@@ -107,3 +150,15 @@ class Symbol:
     @staticmethod
     def _create_meta(name):
         return f"_ninetoothed_meta_{name}"
+class _FindAndReplacer(ast.NodeTransformer):
+    def __init__(self, target, replacement):
+        self._target_id = target.id
+        self._replacement = replacement
+    def visit_Name(self, node):
+        if node.id == self._target_id:
+            return self._replacement
+        return self.generic_visit(node)

ninetoothed/tensor.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import itertools
+import re
 from ninetoothed.language import call
 from ninetoothed.symbol import Symbol
@@ -7,19 +8,24 @@ from ninetoothed.symbol import Symbol
 class Tensor:
     num_instances = 0
-    def __init__(self, ndim=None, shape=None, dtype=None, strides=None, name=None):
+    def __init__(
+        self,
+        ndim=None,
+        shape=None,
+        dtype=None,
+        strides=None,
+        other=None,
+        original=None,
+    ):
         type(self).num_instances += 1
         self.dtype = dtype
-        if name is not None:
-            self.name = name
-        else:
-            self.name = f"tensor_{type(self).num_instances}"
+        self.name = f"tensor_{type(self).num_instances}"
         if ndim is not None:
-            self.shape = [Symbol(f"{self.name}_size_{i}") for i in range(ndim)]
-            self.strides = [Symbol(f"{self.name}_stride_{i}") for i in range(ndim)]
+            self.shape = [Symbol(self.size_string(i)) for i in range(ndim)]
+            self.strides = [Symbol(self.stride_string(i)) for i in range(ndim)]
         else:
             self.shape = shape
@@ -28,6 +34,13 @@ class Tensor:
             else:
                 self.strides = self._calculate_default_strides(shape)
+        self.other = other
+        if original is not None:
+            self.original = original
+        else:
+            self.original = self
     def tile(self, tile_shape, tile_strides=None):
         if tile_strides is None:
             tile_strides = [1 for _ in tile_shape]
@@ -59,10 +72,10 @@ class Tensor:
                 shape=inner_shape,
                 dtype=self.dtype,
                 strides=inner_strides,
-                name=self.name,
+                original=self.original,
             ),
             strides=outer_strides,
-            name=self.name,
+            original=self.original,
         )
     def expand(self, shape):
@@ -77,12 +90,21 @@ class Tensor:
                 stride if new_size == -1 else 0
                 for new_size, stride in zip(shape, self.strides)
             ],
-            name=self.name,
+            original=self.original,
+        )
+    def squeeze(self, dim):
+        # TODO: Add error handling.
+        return type(self)(
+            shape=[size for i, size in enumerate(self.shape) if dim != i],
+            dtype=self.dtype,
+            strides=[stride for i, stride in enumerate(self.strides) if dim != i],
+            original=self.original,
         )
     def names(self):
         return (
-            {self._pointer()}
+            {self.original.pointer_string()}
             | {
                 name
                 for value in itertools.chain(self.shape, self.strides)
@@ -92,34 +114,31 @@ class Tensor:
             | (self.dtype.names() if isinstance(self.dtype, type(self)) else set())
         )
-    def pointers(self, offsets=None):
-        if offsets is None:
-            offsets = self.offsets()
-        return self._pointer() + offsets
     def offsets(self, indices=None):
         if indices is None:
             indices = self.indices()
-        if not isinstance(self.dtype, type(self)):
-            if indices:
-                raise IndexError("Incorrect number of indices.")
+        offsets = [[] for _ in range(self.original.ndim)]
+        curr = self
+        start = 0
+        while isinstance(curr, type(self)):
+            stop = start + curr.ndim
+            curr_indices = indices[start:stop]
+            for index, stride in zip(curr_indices, curr.strides):
+                for dim in self._dims_of(stride):
+                    offsets[dim].append(index * stride)
-            return sum(
-                self.stride(idx)
-                * call("arange", 0, self.size(idx))[
-                    tuple(slice(None) if i == idx else None for i in range(self.ndim()))
-                ]
-                for idx in range(self.ndim())
-            )
+            start = stop
+            curr = curr.dtype
-        outer_indices = indices[: self.ndim()]
-        inner_indices = indices[self.ndim() :]
+        for dim in range(self.original.ndim):
+            offsets[dim] = sum(offsets[dim])
+            offsets[dim].find_and_replace(Symbol(self.original.strides[dim]), Symbol(1))
-        return sum(
-            index * stride for index, stride in zip(outer_indices, self.strides)
-        ) + self.dtype.offsets(inner_indices)
+        return offsets
     def indices(self, index=None):
         if index is None:
@@ -127,14 +146,38 @@ class Tensor:
         indices = []
-        for stride in type(self)(shape=self.shape, name=self.name).strides:
+        for stride in type(self)(shape=self.shape, original=self.original).strides:
             indices.append(index // stride)
             index %= stride
+        curr = self.dtype
+        while isinstance(curr.dtype, type(self)):
+            for _ in range(curr.ndim):
+                indices.append(0)
+            curr = curr.dtype
+        if isinstance(curr, type(self)):
+            for dim in range(curr.ndim):
+                indices.append(call("arange", 0, curr.shape[dim]))
         return tuple(indices)
-    def ndim(self):
-        return len(self.shape)
+    def inmost(self):
+        if not isinstance(self.dtype, type(self)):
+            return self
+        return self.dtype.inmost()
+    def pointer_string(self):
+        return f"{self.name}_pointer"
+    def size_string(self, dim):
+        return f"{self.name}_size_{dim}"
+    def stride_string(self, dim):
+        return f"{self.name}_stride_{dim}"
     def size(self, dim=None):
         if dim is None:
@@ -148,12 +191,31 @@ class Tensor:
         return self.strides[dim]
+    @property
+    def ndim(self):
+        return len(self.shape)
     @staticmethod
-    def is_pointer(name):
-        return name.endswith("_ptr")
+    def pointer_pattern():
+        return re.compile(rf"({_identifier_pattern_raw_string()})_(pointer)")
-    def _pointer(self):
-        return f"{self.name}_ptr"
+    @staticmethod
+    def size_pattern():
+        return re.compile(rf"({_identifier_pattern_raw_string()})_(size)_(.+)")
+    @staticmethod
+    def stride_pattern():
+        return re.compile(rf"({_identifier_pattern_raw_string()})_(stride)_(.+)")
+    def _dims_of(self, stride):
+        dims = set()
+        names = stride.names() if isinstance(stride, Symbol) else {stride}
+        for dim, original_stride in enumerate(self.original.strides):
+            if str(original_stride) in names:
+                dims.add(dim)
+        return dims
     @staticmethod
     def _calculate_default_strides(shape):
@@ -163,3 +225,7 @@ class Tensor:
             strides.append(size * strides[-1])
         return reversed(strides)
+def _identifier_pattern_raw_string():
+    return r"[a-zA-Z_][a-zA-Z0-9_]*"

ninetoothed/torchifier.py CHANGED Viewed

@@ -1,23 +1,27 @@
 import ast
-import re
+from ninetoothed.tensor import Tensor
 class Torchifier(ast.NodeTransformer):
     def visit_Name(self, node):
         self.generic_visit(node)
-        pattern = re.compile(r"([a-zA-Z_][a-zA-Z0-9_]*)_(size|stride)_(.+)")
+        source = node.id
+        def repl(match):
+            return f"{match.group(1)}"
+        source = Tensor.pointer_pattern().sub(repl, source)
+        def repl(match):
+            return f"{match.group(1)}.{match.group(2)}({match.group(3)})"
-        node.id = node.id.replace("_ptr", "")
+        source = Tensor.size_pattern().sub(repl, source)
+        source = Tensor.stride_pattern().sub(repl, source)
-        if re.fullmatch(pattern, node.id):
-            return ast.parse(
-                pattern.sub(
-                    lambda match: f"{match.group(1)}.{match.group(2)}({match.group(3)})",
-                    node.id,
-                ),
-                mode="eval",
-            ).body
+        if source != node.id:
+            return ast.parse(source, mode="eval").body
         return node

{ninetoothed-0.1.1.dist-info → ninetoothed-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ninetoothed
-Version: 0.1.1
+Version: 0.3.0
 Summary: A domain-specific language based on Triton but providing higher-level abstraction.
 Project-URL: Homepage, https://github.com/InfiniTensor/ninetoothed
 Project-URL: Issues, https://github.com/InfiniTensor/ninetoothed/issues
@@ -10,6 +10,7 @@ Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.10
+Requires-Dist: triton>=3.0.0
 Description-Content-Type: text/markdown
 # NineToothed
@@ -26,7 +27,7 @@ We can use `pip` to install `ninetoothed`.
 pip install ninetoothed
 ```
-After successfully running the above command, `ninetoothed` will be installed. However, to fully utilize its capabilities, you also need to install `triton` and a deep learning framework supported by `ninetoothed`. For trial purposes, we recommend installing `triton` and `torch`.
+After successfully running the above command, `ninetoothed` will be installed. However, to fully utilize its capabilities, you also need to install a deep learning framework supported by `ninetoothed`. For trial purposes, we recommend installing `torch`.
 ## Usage
@@ -64,16 +65,19 @@ c_tiled = Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
 a_tiled = a_tiled.expand((-1, c_tiled.shape[1]))
 b_tiled = b_tiled.expand((c_tiled.shape[0], -1))
+a_tiled.dtype = a_tiled.dtype.squeeze(0)
+b_tiled.dtype = b_tiled.dtype.squeeze(1)
 @ninetoothed.jit
 def matmul_kernel(a: a_tiled, b: b_tiled, c: c_tiled):
     accumulator = ninetoothed.language.zeros(
         c.shape, dtype=ninetoothed.language.float32
     )
-    for k in range(a.shape[1]):
-        accumulator = ninetoothed.language.dot(a[0, k], b[k, 0], accumulator)
+    for k in range(a.shape[0]):
+        accumulator += ninetoothed.language.dot(a[k], b[k])
     c = accumulator.to(ninetoothed.language.float16)
 ```
-For matrix multiplication, we also have three tensor parameters, but the tiling method is more complex than vector addition. We denote the three matrices as $A$, $B$, and $C$, where $A$ and $B$ are inputs, and $C$ is the output. Tiling $C$ is simple; we just need to divide it into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_N)` by rows and columns. Once each block computes its result, the entire $C$ is computed. However, how should we tile $A$ and $B$? The answer is to introduce another meta-parameter `BLOCK_SIZE_K`. This way, we can divide $A$ into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_K)` and $B$ into blocks of size `(BLOCK_SIZE_K, BLOCK_SIZE_N)`. However, for matrix multiplication, $A$ and $B$ do not correspond block by block; each row of $A$ needs to correspond to each column of $B$. Therefore, we need to further `tile` $A$ and $B$ by rows and columns, respectively. Up to this point, we have a set of row blocks of $A$ and column blocks of $B$. However, each row block of $A$ must correspond to every column block of $B$. This is where `expand` comes in. We `expand` the row blocks of $A$ along the columns to the number of columns of $C$ and the column blocks of $B$ along the rows to the number of rows of $C$. This way, we successfully tile $A$, $B$, and $C$.
+For matrix multiplication, we also have three tensor parameters, but the tiling method is more complex than vector addition. We denote the three matrices as $A$, $B$, and $C$, where $A$ and $B$ are inputs, and $C$ is the output. Tiling $C$ is simple; we just need to divide it into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_N)` by rows and columns. Once each block computes its result, the entire $C$ is computed. However, how should we tile $A$ and $B$? The answer is to introduce another meta-parameter `BLOCK_SIZE_K`. This way, we can divide $A$ into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_K)` and $B$ into blocks of size `(BLOCK_SIZE_K, BLOCK_SIZE_N)`. However, for matrix multiplication, $A$ and $B$ do not correspond block by block; each row of $A$ needs to correspond to each column of $B$. Therefore, we need to further `tile` $A$ and $B$ by rows and columns, respectively. Up to this point, we have a set of row blocks of $A$ and column blocks of $B$. However, each row block of $A$ must correspond to every column block of $B$. This is where `expand` comes in. We `expand` the row blocks of $A$ along the columns to the number of columns of $C$ and the column blocks of $B$ along the rows to the number of rows of $C$. This way, we successfully tile $A$, $B$, and $C$. In fact, our meta-operations up to this point have already enabled us to write kernel functions. However, we notice that the levels where the row blocks and column blocks reside, which we mentioned earlier, are two-dimensional, and their sizes are of the forms `(1, ...)` and `(..., 1)`. This means that if no other operations are performed, the way we access row blocks and column blocks would have to be `a[0, k]` and `b[k, 0]`. If we want to use `a` to find the range of `k`, we would need to use `a.shape[1]`, but we know that dimensions of size `1` can actually be removed completely. This is why we added two lines of `squeeze`. 
The `dtype` refers to the data type, which in PyTorch can generally be some integer or floating-point type, such as `torch.float32`. However, since meta-operations like `tile` can be performed in NineToothed, `dtype` can also be a `Tensor`. In other words, there is a concept of "tensors that store tensors" in NineToothed. In summary, these two lines perform operations on the tensors stored in the outermost tensor, removing the dimensions of size `1`. This way, when we access the row and column blocks, we can use `a[k]` and `b[k]`, and when finding the range of `k`, we can use `a.shape[0]`.
 With tiling done, the rest is simple. In the function body, we define an `accumulator` to accumulate intermediate results. We then iterate through the corresponding row blocks of $A$ and column blocks of $B$, multiplying them and accumulating the results in `accumulator`. Finally, we place the `accumulator` in the corresponding block of $C$. Since each block of the parameter tensors undergoes this operation, the multiplication is completed for the whole tensors as well.

ninetoothed-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+ninetoothed/__init__.py,sha256=T5UJXlC-wbo8JKPbLUNT65Kccp12xP52WFV5FsugETI,147
+ninetoothed/jit.py,sha256=nhjZRi8_kcjWZX0eOrnxLlzJfVg5vn12f9oi0Er2ABE,15515
+ninetoothed/language.py,sha256=YwjlBENmmKPTnhaQ2uYbj5MwzrCAT7MLJ6VkQ6NeXJE,504
+ninetoothed/symbol.py,sha256=Bd54qcI8KQAX0JRE_wPXycswtdSofhZ6Rr5MtZcv9fo,4665
+ninetoothed/tensor.py,sha256=_DrjOJ-pBvEbSNUvUoYJduLQXmuKgNcqhe4xUDMVoZw,6275
+ninetoothed/torchifier.py,sha256=8M2PDwyFIfVypX6Z-Vt_bGbsCPqxqKnftL0rXeh9bOM,911
+ninetoothed-0.3.0.dist-info/METADATA,sha256=CqdtfdV0eHzSwxJmFpD2IG5d4WTc6RDlpqMZue4Ml2Q,6720
+ninetoothed-0.3.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+ninetoothed-0.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ninetoothed-0.3.0.dist-info/RECORD,,

ninetoothed-0.1.1.dist-info/RECORD DELETED Viewed

@@ -1,10 +0,0 @@
-ninetoothed/__init__.py,sha256=T5UJXlC-wbo8JKPbLUNT65Kccp12xP52WFV5FsugETI,147
-ninetoothed/jit.py,sha256=DdRdZ7DhfZwJeS7AcO_RhD9TZcCebKI55V4_6UHs3bo,10523
-ninetoothed/language.py,sha256=cSuTgi5OwmLFy-dy_AHGZzRm18wz01ByHQ2vioP1vTg,437
-ninetoothed/symbol.py,sha256=8BI4ekeLuUdHTEREvMMlAzwrJ93pqiCdSHGc38clBFA,3034
-ninetoothed/tensor.py,sha256=o_HLEuaBzojmbMLnbPGLcw4iqBI34TNdES3YLTagztE,4590
-ninetoothed/torchifier.py,sha256=JmIVQE8r0zr_RLExsRDOGNsMu0F7v6J_o22aWqlw81k,841
-ninetoothed-0.1.1.dist-info/METADATA,sha256=1Nv6Xcz7CrpEUrzAYH93bYVX8GfPtHwzj4yofeaoJro,5422
-ninetoothed-0.1.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-ninetoothed-0.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-ninetoothed-0.1.1.dist-info/RECORD,,

{ninetoothed-0.1.1.dist-info → ninetoothed-0.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{ninetoothed-0.1.1.dist-info → ninetoothed-0.3.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

ninetoothed 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

ninetoothed 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl