ninetoothed-0.6.0-py3-none-any.whl → ninetoothed-0.8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ninetoothed/__init__.py +2 -2
- ninetoothed/jit.py +304 -90
- ninetoothed/naming.py +50 -0
- ninetoothed/symbol.py +39 -44
- ninetoothed/tensor.py +162 -78
- {ninetoothed-0.6.0.dist-info → ninetoothed-0.8.0.dist-info}/METADATA +10 -2
- ninetoothed-0.8.0.dist-info/RECORD +11 -0
- {ninetoothed-0.6.0.dist-info → ninetoothed-0.8.0.dist-info}/WHEEL +1 -1
- ninetoothed-0.6.0.dist-info/RECORD +0 -10
- {ninetoothed-0.6.0.dist-info → ninetoothed-0.8.0.dist-info}/licenses/LICENSE +0 -0
ninetoothed/__init__.py
CHANGED
ninetoothed/jit.py
CHANGED
@@ -1,31 +1,51 @@
 import ast
 import collections
+import copy
 import functools
 import importlib.util
 import inspect
 import itertools
 import math
+import subprocess
 import sys
 import tempfile

 import triton

+import ninetoothed.naming as naming
 from ninetoothed.language import attribute, call
 from ninetoothed.symbol import Symbol
 from ninetoothed.tensor import Tensor
 from ninetoothed.torchifier import Torchifier


-def
-
+def make(arrangement, application, tensors):
+    params = inspect.signature(application).parameters
+    types = arrangement(*tensors)
+    annotations = {param: type for param, type in zip(params, types)}
+    application.__annotations__ = annotations
+
+    return jit(application)
+
+
+def jit(_func=None, *, _prettify=False):
+    def wrapper(func):
+        return JIT(func, _prettify=_prettify)()
+
+    if _func is None:
+        return wrapper
+
+    return wrapper(_func)


 class JIT:
     handles = collections.defaultdict(dict)

-    def __init__(self, func):
+    def __init__(self, func, _prettify=False):
         self.func = func

+        self._prettify = _prettify
+
     def __call__(self):
         source_file = inspect.getsourcefile(self.func)
         source_line = inspect.getsourcelines(self.func)[1]
@@ -40,12 +60,26 @@ class JIT:

         CodeGenerator(inspect.get_annotations(self.func)).visit(tree)
         Tritonizer().visit(tree)
+        _BinOpSimplifier().visit(tree)
         ast.fix_missing_locations(tree)

+        if self._prettify:
+            name_collector = _SimplifiedNameCollector()
+            name_collector.visit(tree)
+
         unparsed = ast.unparse(tree).replace("None:", ":").replace(":None", ":")
         dependencies = self._find_dependencies()
         source = "\n\n".join((unparsed, dependencies)).strip()

+        if self._prettify:
+            for original, simplified in name_collector.simplified_names.items():
+                if simplified not in name_collector.simplified_names:
+                    source = source.replace(original, simplified)
+
+            source = subprocess.check_output(
+                ["ruff", "format", "-"], input=source, encoding="utf-8"
+            )
+
         with tempfile.NamedTemporaryFile(delete=False, suffix=".py") as temp_file:
             temp_file.write(source.encode("utf-8"))
             temp_file_name = temp_file.name
@@ -67,10 +101,12 @@ class JIT:
         module = ast.parse(inspect.getsource(inspect.getmodule(self.func)))

         _AliasRestorer().visit(module)
+        collector = _ImportCollector()
+        collector.visit(module)
         finder = _FunctionDefFinder(self.func.__name__)
         finder.visit(module)

-        return ast.Module(body=[finder.result], type_ignores=[])
+        return ast.Module(body=collector.imports + [finder.result], type_ignores=[])

     def _find_dependencies(self):
         dependencies = set()
@@ -115,30 +151,12 @@ class CodeGenerator(ast.NodeTransformer):
     def visit_FunctionDef(self, node):
         self._func_def = node

-        self.
+        self._invariants = {}

-
-        if not isinstance(arg, Tensor) or arg.ndim == 0:
-            continue
-
-        offsets = arg.offsets()
-
-        initializations = {
-            type(self)._name_for_offsets(arg, dim): offs
-            for dim, offs in enumerate(offsets)
-        } | {
-            type(self)._name_for_pointers(arg): arg.original.pointer_string()
-            + sum(
-                type(self)._name_for_offsets(arg, dim)[
-                    type(self)._generate_slices(arg, dim)
-                ]
-                * stride
-                for dim, stride in enumerate(arg.original.strides)
-            )
-        }
+        self.generic_visit(node)

-
-
+        for target, value in reversed(self._invariants.items()):
+            node.body.insert(0, ast.Assign(targets=[target.node], value=value.node))

         return node

@@ -147,12 +165,17 @@ class CodeGenerator(ast.NodeTransformer):

         names_of_args = [arg.names() - {"ninetoothed"} for arg in self._args]
         names = functools.reduce(lambda x, y: x | y, names_of_args)
-        meta_names = {name for name in names if
+        meta_names = {name for name in names if naming.is_meta(name)}
         non_meta_names = {name for name in names if name not in meta_names}
+        non_meta_names |= {
+            naming.make_next_power_of_2(name)
+            for name in non_meta_names
+            if naming.is_constexpr(name)
+        }

         node.args = [
             ast.arg(arg=name)
-            if not
+            if not naming.is_constexpr(name)
             else ast.arg(arg=name, annotation=attribute("constexpr").node)
             for name in non_meta_names
         ] + [
@@ -161,24 +184,20 @@ class CodeGenerator(ast.NodeTransformer):
         ]

         autotune = self._generate_autotune(non_meta_names, meta_names)
-        self._func_def.decorator_list
+        self._func_def.decorator_list = [autotune, Symbol("triton.jit").node]

         self._launch = self._generate_launch(non_meta_names, meta_names)

         return node

     def visit_Subscript(self, node):
-        if (
-            isinstance(node.value, ast.Name)
-            and node.value.id in self._context
-            and isinstance(node.ctx, ast.Load)
-        ):
+        if self._in_context(node.value) and isinstance(node.ctx, ast.Load):
             value = self._context[node.value.id]

             if isinstance(value, Tensor):
-                return
+                return self._generate_load(
                     value,
-
+                    indices=node.slice.elts
                     if isinstance(node.slice, ast.Tuple)
                     else (node.slice,),
                 )
@@ -188,7 +207,7 @@ class CodeGenerator(ast.NodeTransformer):
         return node

     def visit_Attribute(self, node):
-        if
+        if self._in_context(node.value):
             value = self._context[node.value.id]

             if isinstance(value, Tensor):
@@ -203,8 +222,8 @@ class CodeGenerator(ast.NodeTransformer):
     def visit_Name(self, node):
         self.generic_visit(node)

-        if
-            return
+        if self._in_context(node) and isinstance(node.ctx, ast.Load):
+            return self._generate_load(self._context[node.id])

         return node

@@ -212,16 +231,15 @@ class CodeGenerator(ast.NodeTransformer):
         if len(node.targets) == 1:
             target = node.targets[0]

-            if
+            if self._in_context(target):
                 self.generic_visit(node)

                 return ast.Expr(
-
+                    self._generate_store(self._context[target.id], node.value)
                 )
             elif (
                 isinstance(target, ast.Subscript)
-                and
-                and target.value.id in self._context
+                and self._in_context(target.value)
                 and isinstance(target.ctx, ast.Store)
             ):
                 value = self._context[target.value.id]
@@ -230,10 +248,10 @@ class CodeGenerator(ast.NodeTransformer):
                 self.generic_visit(node)

                 return ast.Expr(
-
+                    self._generate_store(
                         value,
                         node.value,
-
+                        indices=target.slice.elts
                         if isinstance(target.slice, ast.Tuple)
                         else (target.slice,),
                     )
@@ -243,6 +261,11 @@ class CodeGenerator(ast.NodeTransformer):

         return node

+    _NAME_FOR_PID = Symbol("ninetoothed_pid")
+
+    def _in_context(self, node):
+        return isinstance(node, ast.Name) and node.id in self._context
+
     def _generate_autotune(self, params, meta):
         device = triton.runtime.driver.active.get_current_device()
         properties = triton.runtime.driver.active.utils.get_device_properties(device)
@@ -303,27 +326,54 @@ class CodeGenerator(ast.NodeTransformer):
         )

     def _generate_launch(self, params, meta):
-
-
-
+        non_next_power_of_2_constexpr_params = [
+            param
+            for param in params
+            if naming.is_constexpr(param) and not naming.is_next_power_of_2(param)
+        ]
+        non_next_power_of_2_constexpr_params_without_prefixes = [
+            naming.remove_prefixes(param)
+            for param in non_next_power_of_2_constexpr_params
+        ]
+        next_power_of_2_params = [
+            param for param in params if naming.is_next_power_of_2(param)
+        ]
+        next_power_of_2_params_without_prefixes = [
+            naming.remove_prefixes(param) for param in next_power_of_2_params
         ]

         launch = ast.FunctionDef(
             name=f"launch_{self._func_def.name}",
             args=ast.arguments(
                 posonlyargs=[],
-                args=[ast.arg(arg=arg.
-                + [
+                args=[ast.arg(arg=arg.source.name) for arg in self._args]
+                + [
+                    ast.arg(arg=param)
+                    for param in non_next_power_of_2_constexpr_params_without_prefixes
+                ],
                 kwonlyargs=[],
                 defaults=[],
             ),
             body=[
                 ast.Assign(
                     targets=[ast.Name(id=param, ctx=ast.Store())],
-                    value=ast.Name(id=
+                    value=ast.Name(id=param_without_prefixes, ctx=ast.Load()),
+                )
+                for param, param_without_prefixes in zip(
+                    non_next_power_of_2_constexpr_params,
+                    non_next_power_of_2_constexpr_params_without_prefixes,
+                )
+            ]
+            + [
+                ast.Assign(
+                    targets=[ast.Name(id=param, ctx=ast.Store())],
+                    value=Symbol(
+                        f"triton.next_power_of_2({param_without_prefixes})"
+                    ).node,
                 )
-                for param,
-
+                for param, param_without_prefixes in zip(
+                    next_power_of_2_params,
+                    next_power_of_2_params_without_prefixes,
                 )
             ]
             + [
@@ -369,51 +419,105 @@ class CodeGenerator(ast.NodeTransformer):

         return ast.parse(f"lambda meta: ({num_elements},)", mode="eval").body

-
-    def _generate_load(tensor, intermediate_indices=()):
+    def _generate_load(self, tensor, indices=()):
         if tensor.ndim == 0:
-            return Symbol(tensor.
+            return Symbol(tensor.source.name).node

-        pointers, mask =
-
-        )
-        other = CodeGenerator._generate_other(tensor)
+        pointers, mask = self._generate_pointers_and_mask(tensor, indices)
+        other = type(self)._generate_other(tensor)

         return call("load", pointers, mask=mask, other=other).node

-
-
-        pointers, mask = CodeGenerator._generate_pointers_and_mask(
-            tensor, intermediate_indices
-        )
+    def _generate_store(self, tensor, value, indices=()):
+        pointers, mask = self._generate_pointers_and_mask(tensor, indices)

         return call("store", pointers, value, mask=mask).node

-
-
-
-
-        )
-
-
-        for
-
-
-
+    def _generate_pointers_and_mask(self, tensor, indices):
+        invariant_target_dims = type(self)._find_invariant_target_dims(tensor)
+
+        indices = self._complete_indices(tensor, indices)
+        offsets = type(self)._generate_offsets(tensor, indices)
+
+        for source_dim in range(tensor.source.ndim):
+            for target_dim in range(tensor.target.ndim):
+                if target_dim not in invariant_target_dims:
+                    continue
+
+                name = type(self)._name_for_offsets(tensor, source_dim, target_dim)
+                self._invariants[name] = offsets[source_dim][target_dim]
+                offsets[source_dim][target_dim] = name
+
+        name_for_pointers = type(self)._name_for_pointers(tensor)
+        self._invariants[name_for_pointers] = Symbol(tensor.source.pointer_string())
+
+        for source_dim in range(tensor.source.ndim):
+            for target_dim in range(tensor.target.ndim):
+                if target_dim not in invariant_target_dims:
+                    continue
+
+                self._invariants[name_for_pointers] += (
+                    offsets[source_dim][target_dim][
+                        type(self)._generate_slices(tensor, target_dim)
+                    ]
+                    * tensor.source.strides[source_dim]
+                )
+
+        pointers = name_for_pointers + sum(
+            offsets[source_dim][target_dim][
+                type(self)._generate_slices(tensor, target_dim)
+            ]
+            * tensor.source.strides[source_dim]
+            for source_dim in range(tensor.source.ndim)
+            for target_dim in range(tensor.target.ndim)
+            if target_dim not in invariant_target_dims
+            and offsets[source_dim][target_dim] != 0
        )
        mask = functools.reduce(
            lambda x, y: x & y,
            (
-
-
+                offsets[source_dim][target_dim][
+                    type(self)._generate_slices(tensor, target_dim)
+                ]
+                < tensor.source.shape[source_dim]
+                for source_dim in range(tensor.source.ndim)
+                for target_dim in range(tensor.target.ndim)
+                if offsets[source_dim][target_dim] != 0
            ),
        )

        return pointers, mask

+    def _complete_indices(self, tensor, indices):
+        indices = list(self._generate_pid_indices(tensor) + indices)
+
+        for size in tensor.inmost().shape:
+            if Symbol.is_name(size):
+                name = size.node.id
+                if not naming.is_meta(name):
+                    size = naming.make_next_power_of_2(name)
+
+            indices.append(call("arange", 0, size))
+
+        return tuple(indices)
+
+    def _generate_pid_indices(self, tensor):
+        self._invariants[type(self)._NAME_FOR_PID] = call("program_id", 0)
+
+        indices = list(
+            type(self)._unravel_index(type(self)._NAME_FOR_PID, tensor.shape)
+        )
+
+        for dim, index in enumerate(indices):
+            name = type(self)._name_for_index(tensor, dim)
+            self._invariants[name] = index
+            indices[dim] = name
+
+        return tuple(indices)
+
     @staticmethod
     def _generate_other(tensor):
-        other = tensor.
+        other = tensor.source.other

         if isinstance(other, float) and not math.isfinite(other):
             return f"float('{other}')"
@@ -425,23 +529,86 @@ class CodeGenerator(ast.NodeTransformer):
         return tuple(slice(None) if i == dim else None for i in range(tensor.ndim))

     @staticmethod
-    def
-
-        for offs in tensor.offsets(
-            [0 for _ in range(tensor.ndim)]
-            + list(intermediate_indices)
-            + [0 for _ in range(tensor.inmost().ndim)]
-        )
+    def _generate_offsets(tensor, indices):
+        offsets = collections.defaultdict(
+            lambda: collections.defaultdict(lambda: Symbol(0))
         )

+        curr = tensor
+        start = 0
+
+        while isinstance(curr, type(tensor)):
+            stop = start + curr.ndim
+            curr_indices = indices[start:stop]
+
+            for index, stride, source_dim, target_dim in zip(
+                curr_indices, curr.strides, curr.source_dims, curr.target_dims
+            ):
+                offsets[source_dim][target_dim] += index * stride
+
+            start = stop
+            curr = curr.dtype
+
+        for source_dim in tuple(offsets):
+            for target_dim in tuple(offsets[source_dim]):
+                if not isinstance(source_dim, tuple):
+                    continue
+
+                unraveled = CodeGenerator._unravel_index(
+                    offsets[source_dim][target_dim],
+                    tuple(tensor.source.shape[dim] for dim in source_dim),
+                )
+
+                for offs, dim in zip(unraveled, source_dim):
+                    offsets[dim][target_dim] = offs
+
+        for source_dim in range(tensor.source.ndim):
+            for target_dim in range(tensor.target.ndim):
+                offsets[source_dim][target_dim] = copy.deepcopy(
+                    offsets[source_dim][target_dim]
+                )
+                offsets[source_dim][target_dim].find_and_replace(
+                    Symbol(tensor.source.strides[source_dim]), Symbol(1)
+                )
+
+        return offsets
+
+    @staticmethod
+    def _find_invariant_target_dims(tensor):
+        invariant_target_dims = set()
+
+        curr = tensor.dtype
+
+        while isinstance(curr.dtype, Tensor):
+            for target_dim in range(curr.target.ndim):
+                if target_dim not in curr.target_dims:
+                    invariant_target_dims.add(target_dim)
+
+            curr = curr.dtype
+
+        return invariant_target_dims
+
     @staticmethod
     def _name_for_pointers(tensor):
-        return Symbol(f"{tensor.
+        return Symbol(f"{tensor.source.name}_pointers")

     @staticmethod
-    def _name_for_offsets(tensor,
-        return Symbol(f"{tensor.
+    def _name_for_offsets(tensor, source_dim, target_dim):
+        return Symbol(f"{tensor.source.name}_offsets_{source_dim}_{target_dim}")
+
+    @staticmethod
+    def _name_for_index(tensor, dim):
+        return Symbol(f"{tensor.source.name}_index_{dim}")
+
+    @staticmethod
+    def _unravel_index(index, shape):
+        indices = []
+
+        for stride in Tensor(shape=shape).strides:
+            indices.append(index // stride)
+            index %= stride
+
+        return tuple(indices)


 class Tritonizer(ast.NodeTransformer):
@@ -477,6 +644,36 @@ class Tritonizer(ast.NodeTransformer):
         return node


+class _BinOpSimplifier(ast.NodeTransformer):
+    def visit_BinOp(self, node):
+        self.generic_visit(node)
+
+        if isinstance(node.op, ast.Mult):
+            left = Symbol(node.left)
+            right = Symbol(node.right)
+
+            if left == 0 or right == 0:
+                return Symbol(0).node
+
+            if left == 1:
+                return node.right
+
+            if right == 1:
+                return node.left
+
+        return node
+
+
+class _SimplifiedNameCollector(ast.NodeVisitor):
+    def __init__(self):
+        self.simplified_names = {}
+
+    def visit_Name(self, node):
+        self.generic_visit(node)
+
+        self.simplified_names[node.id] = naming.remove_prefixes(node.id)
+
+
 class _Handle:
     def __init__(self, kernel, launch, source):
         self._kernel = kernel
@@ -535,6 +732,23 @@ class _AliasRestorer(ast.NodeTransformer):
         return node


+class _ImportCollector(ast.NodeVisitor):
+    def __init__(self):
+        super().__init__()
+
+        self.imports = []
+
+    def visit_Import(self, node):
+        self.imports.append(node)
+
+        self.generic_visit(node)
+
+    def visit_ImportFrom(self, node):
+        self.imports.append(node)
+
+        self.generic_visit(node)
+
+
 class _FunctionDefFinder(ast.NodeVisitor):
     def __init__(self, name):
         self._name = name
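The rewritten jit.py above introduces two entry points: `jit`, now usable with or without arguments, and `make`, which zips the types produced by an arrangement function onto an application function's annotations before JIT-compiling it. A minimal sketch of how they compose, reusing the vector-addition example from the README excerpt further down; it assumes `make`, `Symbol`, and `Tensor` are re-exported at package level (the `__init__.py` hunk is collapsed above, so that part is an assumption):

```python
import ninetoothed
from ninetoothed import Symbol, Tensor

# A compile-time meta-parameter, as in the README's add_kernel example.
BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)


def arrangement(x, y, z):
    # Tile each one-dimensional tensor into blocks of BLOCK_SIZE elements.
    return tuple(tensor.tile((BLOCK_SIZE,)) for tensor in (x, y, z))


def application(x, y, z):
    z = x + y  # Each program instance adds one block.


# make() copies the arranged types into application.__annotations__ and then
# calls jit(application), exactly as the new code above shows.
add_kernel = ninetoothed.make(
    arrangement, application, (Tensor(1), Tensor(1), Tensor(1))
)
```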
ninetoothed/naming.py
ADDED
@@ -0,0 +1,50 @@
+import re
+
+
+def make_constexpr(name):
+    return _add_prefix(name, _CONSTEXPR)
+
+
+def make_meta(name):
+    return _add_prefix(name, _META)
+
+
+def make_next_power_of_2(name):
+    return _add_prefix(name, _NEXT_POWER_OF_2)
+
+
+def is_constexpr(name):
+    return _CONSTEXPR in _find_prefixes(name) or is_meta(name)
+
+
+def is_meta(name):
+    return _META in _find_prefixes(name)
+
+
+def is_next_power_of_2(name):
+    return _NEXT_POWER_OF_2 in _find_prefixes(name)
+
+
+def remove_prefixes(name):
+    return _PREFIX_PATTERN.sub("", name)
+
+
+_CONSTEXPR = "constexpr"
+
+_META = "meta"
+
+_NEXT_POWER_OF_2 = "next_power_of_2"
+
+_PREFIX_PATTERN = re.compile(r"ninetoothed_((?!_).*?)_prefix_")
+
+
+def _add_prefix(name, string):
+    return f"{_make_prefix(string)}{name}"
+
+
+def _make_prefix(string):
+    return f"ninetoothed_{string}_prefix_"
+
+
+def _find_prefixes(name):
+    return set(_PREFIX_PATTERN.findall(name))
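The new naming module centralizes the name tagging that symbol.py previously implemented with ad-hoc `_ninetoothed_` prefixes, and adds a third tag for next-power-of-2 parameters. A quick illustration of the round trip, derived from the source above (the assertions are assumed behavior, not tests shipped with the package):

```python
import ninetoothed.naming as naming

# Stack two tags on one name; each tag becomes a "ninetoothed_<tag>_prefix_".
name = naming.make_next_power_of_2(naming.make_constexpr("BLOCK_SIZE"))

assert naming.is_constexpr(name)        # the constexpr tag is present
assert naming.is_next_power_of_2(name)  # so is the next-power-of-2 tag
assert not naming.is_meta(name)         # but not the meta tag
assert naming.remove_prefixes(name) == "BLOCK_SIZE"  # tags strip cleanly
```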
ninetoothed/symbol.py
CHANGED
@@ -1,7 +1,10 @@
 import ast
 import inspect
+import numbers
 import types

+import ninetoothed.naming as naming
+

 class Symbol:
     def __init__(self, expr, constexpr=None, meta=None):
@@ -28,18 +31,31 @@ class Symbol:
             if constexpr is False:
                 raise ValueError("Non-constexpr meta symbol is not supported.")

-            self._node.id =
+            self._node.id = naming.make_meta(self._node.id)

         if constexpr:
-            self._node.id =
+            self._node.id = naming.make_constexpr(self._node.id)
+
+    def __eq__(self, other):
+        if isinstance(self._node, ast.Constant):
+            if isinstance(other, Symbol) and isinstance(other._node, ast.Constant):
+                return self._node.value == other._node.value
+
+            if isinstance(other, numbers.Number):
+                return self._node.value == other
+
+        return False
+
+    def __hash__(self):
+        return id(self)

     def __add__(self, other):
         other = type(self)(other)

-        if
+        if self == 0:
             return other

-        if
+        if other == 0:
             return self

         return type(self)(ast.BinOp(left=self._node, op=ast.Add(), right=other._node))
@@ -47,19 +63,30 @@ class Symbol:
     def __radd__(self, other):
         return self.__add__(other)

-    def
+    def __sub__(self, other):
         other = type(self)(other)

-        if
-            return
+        if self == 0:
+            return -other

-        if
+        if other == 0:
+            return self
+
+        return type(self)(ast.BinOp(left=self._node, op=ast.Sub(), right=other._node))
+
+    def __rsub__(self, other):
+        return type(self)(other).__sub__(self)
+
+    def __mul__(self, other):
+        other = type(self)(other)
+
+        if self == 0 or other == 0:
             return type(self)(0)

-        if
+        if self == 1:
             return other

-        if
+        if other == 1:
             return self

         return type(self)(ast.BinOp(left=self._node, op=ast.Mult(), right=other._node))
@@ -136,40 +163,8 @@ class Symbol:
         return SliceSimplifier().visit(self._node)

     @staticmethod
-    def
-        return
-
-    @staticmethod
-    def is_meta(name):
-        return name.startswith(Symbol._meta_prefix())
-
-    @staticmethod
-    def remove_prefix(name):
-        if name.startswith(Symbol._constexpr_prefix()):
-            return name.removeprefix(Symbol._constexpr_prefix())
-
-        if name.startswith(Symbol._meta_prefix()):
-            return name.removeprefix(Symbol._meta_prefix())
-
-    @staticmethod
-    def _create_constexpr(name):
-        return f"{Symbol._constexpr_prefix()}{name}"
-
-    @staticmethod
-    def _create_meta(name):
-        return f"{Symbol._meta_prefix()}{name}"
-
-    @staticmethod
-    def _constexpr_prefix():
-        return f"{Symbol._ninetoothed_prefix()}constexpr_"
-
-    @staticmethod
-    def _meta_prefix():
-        return f"{Symbol._ninetoothed_prefix()}meta_"
-
-    @staticmethod
-    def _ninetoothed_prefix():
-        return "_ninetoothed_"
+    def is_name(object):
+        return isinstance(object, Symbol) and isinstance(object.node, ast.Name)


 class _FindAndReplacer(ast.NodeTransformer):
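With the new `__eq__`/`__hash__` and the rewritten arithmetic, `Symbol` now folds identities with the constants 0 and 1 instead of growing the expression tree, which is what `_BinOpSimplifier` in jit.py relies on. A small sketch of the assumed behavior:

```python
import ast

from ninetoothed.symbol import Symbol

x = Symbol("x")

assert (x + 0) is x  # adding 0 returns the symbol unchanged
assert (x * 1) is x  # multiplying by 1 does too
assert (x * 0) == 0  # multiplying by 0 folds to the constant 0
print(ast.unparse((x * 2).node))  # a real product still builds an AST: "x * 2"
```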
ninetoothed/tensor.py
CHANGED
@@ -1,4 +1,5 @@
 import itertools
+import math
 import re

 from ninetoothed.language import call
@@ -15,13 +16,18 @@ class Tensor:
         dtype=None,
         strides=None,
         other=None,
-
+        name=None,
+        source=None,
+        source_dims=None,
+        target=None,
+        target_dims=None,
     ):
-        type(self).num_instances += 1
-
         self.dtype = dtype

-
+        if name is not None:
+            self.name = name
+        else:
+            self.name = f"_ninetoothed_tensor_{type(self).num_instances}"

         if ndim is not None:
             self.shape = (Symbol(self.size_string(i)) for i in range(ndim))
@@ -36,34 +42,61 @@ class Tensor:

         self.other = other

-        if
-            self.
+        if source is not None:
+            self.source = source
         else:
-            self.
+            self.source = self

-
-
-
+        if source_dims is not None:
+            self.source_dims = source_dims
+        else:
+            self.source_dims = (dim for dim in range(self.source.ndim))
+
+        if target is not None:
+            self.target = target
+        else:
+            self.target = self
+
+        if target_dims is not None:
+            self.target_dims = target_dims
+        else:
+            self.target_dims = (dim for dim in range(self.target.ndim))
+
+        type(self).num_instances += 1
+
+    def tile(self, tile_shape, strides=None, dilation=None):
+        if strides is None:
+            strides = [-1 for _ in tile_shape]
+
+        if dilation is None:
+            dilation = [1 for _ in tile_shape]

         outer_shape = []
         outer_strides = []
         inner_shape = []
         inner_strides = []

-        for
-            self.shape, self.strides, tile_shape,
+        for self_size, self_stride, tile_size, stride, spacing in zip(
+            self.shape, self.strides, tile_shape, strides, dilation
         ):
             if tile_size == -1:
-                tile_size =
+                tile_size = self_size
+
+            if stride == -1:
+                stride = tile_size

-            new_size =
+            new_size = (
+                call("cdiv", self_size - spacing * (tile_size - 1) - 1, stride) + 1
+                if stride != 0
+                else -1
+            )
             outer_shape.append(new_size)

-            new_stride =
+            new_stride = self_stride * stride // spacing
             outer_strides.append(new_stride)

             inner_shape.append(tile_size)
-            next_stride =
+            next_stride = self_stride * spacing
             inner_strides.append(next_stride)

         return type(self)(
@@ -72,10 +105,16 @@ class Tensor:
                 shape=inner_shape,
                 dtype=self.dtype,
                 strides=inner_strides,
-
+                source=self.source,
+                source_dims=self.source_dims,
+                target=self.target,
+                target_dims=self.target_dims,
             ),
             strides=outer_strides,
-
+            source=self.source,
+            source_dims=self.source_dims,
+            target=self.target,
+            target_dims=self.target_dims,
         )

     def expand(self, shape):
@@ -90,7 +129,10 @@ class Tensor:
                 stride if new_size == -1 else 0
                 for new_size, stride in zip(shape, self.strides)
             ],
-
+            source=self.source,
+            source_dims=self.source_dims,
+            target=self.target,
+            target_dims=self.target_dims,
         )

     def squeeze(self, dim):
@@ -99,73 +141,109 @@ class Tensor:
             shape=[size for i, size in enumerate(self.shape) if dim != i],
             dtype=self.dtype,
             strides=[stride for i, stride in enumerate(self.strides) if dim != i],
-
+            source=self.source,
+            source_dims=[
+                source_dim for i, source_dim in enumerate(self.source_dims) if dim != i
+            ],
+            target=self.target,
+            target_dims=[
+                target_dim for i, target_dim in enumerate(self.target_dims) if dim != i
+            ],
         )

-    def
-
-
+    def permute(self, dims):
+        # TODO: Add error handling.
+        new_shape = [None for _ in range(self.ndim)]
+        new_strides = [None for _ in range(self.ndim)]
+        new_source_dims = [None for _ in range(self.ndim)]

-
-
-
-
-
-
-
-
+        for original_dim, permuted_dim in enumerate(dims):
+            new_shape[original_dim] = self.shape[permuted_dim]
+            new_strides[original_dim] = self.strides[permuted_dim]
+            new_source_dims[original_dim] = self.source_dims[permuted_dim]
+
+        return type(self)(
+            shape=new_shape,
+            dtype=self.dtype,
+            strides=new_strides,
+            source=self.source,
+            source_dims=new_source_dims,
+            target=self.target,
+            target_dims=self.target_dims,
         )

-    def
-
-
+    def flatten(self, start_dim=None, end_dim=None):
+        # TODO: Add error handling.
+        if start_dim is None:
+            start_dim = 0
+        if end_dim is None:
+            end_dim = self.ndim

-
+        leading_sizes = self.shape[:start_dim]
+        flattening_sizes = self.shape[start_dim:end_dim]
+        trailing_sizes = self.shape[end_dim:]

-
-        start = 0
+        new_shape = leading_sizes + (math.prod(flattening_sizes),) + trailing_sizes

-
-
-
+        leading_strides = self.strides[:start_dim]
+        flattening_strides = self.strides[start_dim:end_dim]
+        trailing_strides = self.strides[end_dim:]

-
-        for dim in self._dims_of(stride):
-            offsets[dim].append(index * stride)
+        new_strides = leading_strides + (flattening_strides[-1],) + trailing_strides

-
-
+        leading_source_dims = self.source_dims[:start_dim]
+        flattening_source_dims = self.source_dims[start_dim:end_dim]
+        trailing_source_dims = self.source_dims[end_dim:]

-
-
-
-
-        return offsets
-
-    def indices(self, index=None):
-        if index is None:
-            index = call("program_id", 0)
+        new_source_dims = (
+            leading_source_dims + (flattening_source_dims,) + trailing_source_dims
+        )

-
+        return type(self)(
+            shape=new_shape,
+            dtype=self.dtype,
+            strides=new_strides,
+            source=self.source,
+            source_dims=new_source_dims,
+            target=self.target,
+            target_dims=self.target_dims,
+        )

-
-
-
+    def ravel(self):
+        # TODO: Add error handling.
+        new_shape = []
+        new_strides = []

-        curr = self
+        curr = self

-        while isinstance(curr
-
-
+        while isinstance(curr, type(self)):
+            new_shape.extend(curr.shape)
+            new_strides.extend(curr.strides)

             curr = curr.dtype

-
-
-
+        return type(self)(
+            shape=new_shape,
+            strides=new_strides,
+            other=self.source.other,
+            name=self.source.name,
+        )
+
+    def names(self):
+        if self.ndim == 0:
+            return {self.source.name}

-        return
+        return (
+            {self.source.pointer_string()}
+            | {
+                name
+                for value in itertools.chain(self.shape, self.strides)
+                if isinstance(value, Symbol)
+                for name in value.names()
+            }
+            | (self.dtype.names() if isinstance(self.dtype, type(self)) else set())
+            | (self.source.names() if self.source is not self else set())
+        )

     def inmost(self):
         if not isinstance(self.dtype, type(self)):
@@ -214,6 +292,22 @@ class Tensor:
     def ndim(self):
         return len(self.shape)

+    @property
+    def source_dims(self):
+        return self._source_dims
+
+    @source_dims.setter
+    def source_dims(self, value):
+        self._source_dims = tuple(value)
+
+    @property
+    def target_dims(self):
+        return self._target_dims
+
+    @target_dims.setter
+    def target_dims(self, value):
+        self._target_dims = tuple(value)
+
     @staticmethod
     def pointer_pattern():
         return re.compile(rf"({_identifier_pattern_raw_string()})_(pointer)")
@@ -226,21 +320,11 @@ class Tensor:
     def stride_pattern():
         return re.compile(rf"({_identifier_pattern_raw_string()})_(stride)_(.+)")

-    def _dims_of(self, stride):
-        dims = set()
-        names = stride.names() if isinstance(stride, Symbol) else {stride}
-
-        for dim, original_stride in enumerate(self.original.strides):
-            if str(original_stride) in names:
-                dims.add(dim)
-
-        return dims
-
     @staticmethod
     def _calculate_default_strides(shape):
         strides = [1]

-        for size in shape[1:]:
+        for size in reversed(shape[1:]):
             strides.append(size * strides[-1])

         return reversed(strides)
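Two behavioral changes above are easy to sanity-check in plain Python: the generalized `tile` arithmetic, which now accepts `strides` (tile-to-tile step) and `dilation` (element spacing within a tile), and the corrected row-major `_calculate_default_strides`. A worked sketch under those assumed semantics:

```python
def cdiv(a, b):
    return -(-a // b)  # ceiling division, as triton.cdiv computes it


def num_tiles(size, tile_size, stride=None, spacing=1):
    # Mirrors: call("cdiv", self_size - spacing * (tile_size - 1) - 1, stride) + 1
    stride = tile_size if stride is None else stride
    return cdiv(size - spacing * (tile_size - 1) - 1, stride) + 1


assert num_tiles(8192, 1024) == 8        # the README's vector-addition numbers
assert num_tiles(10, 4, stride=2) == 4   # overlapping tiles at offsets 0, 2, 4, 6
assert num_tiles(10, 3, spacing=2) == 3  # a dilated tile spans 2 * (3 - 1) + 1 = 5 elements


def default_strides(shape):
    # Mirrors the fixed _calculate_default_strides: contiguous row-major strides.
    strides = [1]
    for size in reversed(shape[1:]):
        strides.append(size * strides[-1])
    return list(reversed(strides))


assert default_strides((2, 3, 4)) == [12, 4, 1]  # the old forward loop gave [12, 3, 1]
```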
{ninetoothed-0.6.0.dist-info → ninetoothed-0.8.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ninetoothed
-Version: 0.6.0
+Version: 0.8.0
 Summary: A domain-specific language based on Triton but providing higher-level abstraction.
 Project-URL: Homepage, https://github.com/InfiniTensor/ninetoothed
 Project-URL: Issues, https://github.com/InfiniTensor/ninetoothed/issues
@@ -51,6 +51,8 @@ def add_kernel(

 In this code, we first define `BLOCK_SIZE`, which is a `Symbol`. You can think of `"BLOCK_SIZE"` as its name. We see that `meta` is set to `True`, indicating to the compiler that it is a meta-parameter and its value can be determined by the compiler. The `Tensor(1)` constructs a one-dimensional tensor (vector), and `Tensor(1).tile((BLOCK_SIZE,))` means we want to create a vector and divide it into blocks of size `BLOCK_SIZE`. Suppose the size of this vector is `8192` and `BLOCK_SIZE` is `1024`, then the vector will be divided into `8` blocks, each of size `1024`.

+
+
 By using type annotations, we tell the compiler that we will have three tensor parameters, which will be divided into blocks, and `x`, `y`, and `z` are these blocks. It's important to understand that `x`, `y`, and `z` are the blocks, not the tensors themselves. In the function body, `x`, `y`, and `z` are also the blocks. The rest is straightforward (only one line `z = x + y` left, haha), we add each block of `x` and `y` and store it in `z`. Since each block of the parameter tensors undergoes this operation, the addition is completed for the whole tensors as well.

 ### Matrix Multiplication
@@ -82,4 +84,10 @@ def matmul_kernel(a: a_tiled, b: b_tiled, c: c_tiled):

 For matrix multiplication, we also have three tensor parameters, but the tiling method is more complex than vector addition. We denote the three matrices as $A$, $B$, and $C$, where $A$ and $B$ are inputs, and $C$ is the output. Tiling $C$ is simple; we just need to divide it into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_N)` by rows and columns. Once each block computes its result, the entire $C$ is computed. However, how should we tile $A$ and $B$? The answer is to introduce another meta-parameter `BLOCK_SIZE_K`. This way, we can divide $A$ into blocks of size `(BLOCK_SIZE_M, BLOCK_SIZE_K)` and $B$ into blocks of size `(BLOCK_SIZE_K, BLOCK_SIZE_N)`. However, for matrix multiplication, $A$ and $B$ do not correspond block by block; each row of $A$ needs to correspond to each column of $B$. Therefore, we need to further `tile` $A$ and $B$ by rows and columns, respectively. Up to this point, we have a set of row blocks of $A$ and column blocks of $B$. However, each row block of $A$ must correspond to every column block of $B$. This is where `expand` comes in. We `expand` the row blocks of $A$ along the columns to the number of columns of $C$ and the column blocks of $B$ along the rows to the number of rows of $C$. This way, we successfully tile $A$, $B$, and $C$. In fact, our meta-operations up to this point have already enabled us to write kernel functions. However, we notice that the levels where the row blocks and column blocks reside, which we mentioned earlier, are two-dimensional, and their sizes are of the forms `(1, ...)` and `(..., 1)`. This means that if no other operations are performed, the way we access row blocks and column blocks would have to be `a[0, k]` and `b[k, 0]`. If we want to use `a` to find the range of `k`, we would need to use `a.shape[1]`, but we know that dimensions of size `1` can actually be removed completely. This is why we added two lines of `squeeze`. The `dtype` refers to the data type, which in PyTorch can generally be some integer or floating-point type, such as `torch.float32`. However, since meta-operations like `tile` can be performed in NineToothed, `dtype` can also be a `Tensor`. In other words, there is a concept of "tensors that store tensors" in NineToothed. In summary, these two lines perform operations on the tensors stored in the outmost tensor, removing the dimensions of size `1`. This way, when we access the row and column blocks, we can use `a[k]` and `b[k]`, and when finding the range of `k`, we can use `a.shape[0]`.

+
+
 With tiling done, the rest is simple. In the function body, we define an `accumulator` to accumulate intermediate results. We then iterate through the corresponding row blocks of $A$ and column blocks of $B$, multiplying them and accumulating the results in `accumulator`. Finally, we place the `accumulator` in the corresponding block of $C$. Since each block of the parameter tensors undergoes this operation, the multiplication is completed for the whole tensors as well.
+
+## License
+
+This project is distributed under the Apache-2.0 license. See the included [LICENSE](LICENSE) file for details.
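The matmul walkthrough in the README excerpt above maps onto a short tiling pipeline. The following reconstruction is inferred from that prose rather than copied from the diff, so treat the exact calls as illustrative:

```python
from ninetoothed import Symbol, Tensor

BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", meta=True)

# Tile C by rows and columns; one program instance computes one block of C.
c_tiled = Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_N))

# Tile A and B with the shared BLOCK_SIZE_K, then group the blocks into row
# blocks of A and column blocks of B (-1 means "the full extent").
a_tiled = Tensor(2).tile((BLOCK_SIZE_M, BLOCK_SIZE_K)).tile((1, -1))
b_tiled = Tensor(2).tile((BLOCK_SIZE_K, BLOCK_SIZE_N)).tile((-1, 1))

# Expand the row blocks of A along columns and the column blocks of B along
# rows, so every row block meets every column block.
a_tiled = a_tiled.expand((-1, c_tiled.shape[1]))
b_tiled = b_tiled.expand((c_tiled.shape[0], -1))

# Squeeze away the size-1 dimensions so the kernel can index a[k] and b[k]
# and use a.shape[0] as the range of k.
a_tiled.dtype = a_tiled.dtype.squeeze(0)
b_tiled.dtype = b_tiled.dtype.squeeze(1)
```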
ninetoothed-0.8.0.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+ninetoothed/__init__.py,sha256=dX34sk5GA3OgWf1Jc4gJMW3UwcGcJsuG3hs3rkiqq6g,161
+ninetoothed/jit.py,sha256=z70hQEsogfQu0cLxq5m3cOsWsVANcMRJaVv5di9vk1c,23741
+ninetoothed/language.py,sha256=YwjlBENmmKPTnhaQ2uYbj5MwzrCAT7MLJ6VkQ6NeXJE,504
+ninetoothed/naming.py,sha256=3FBnC-S3dAZRcBcob9SrcVpVEYE5IXRacwkCiA3vIGU,891
+ninetoothed/symbol.py,sha256=rZ5nXtn-U1Nw0BBRJ-kfrwmX_zCbAi76un-Z2QFaoZc,4773
+ninetoothed/tensor.py,sha256=_jM0tVgqIwZd3MJJsGVTaLCsSxpPO8JfF4qkMShhQvQ,9429
+ninetoothed/torchifier.py,sha256=8M2PDwyFIfVypX6Z-Vt_bGbsCPqxqKnftL0rXeh9bOM,911
+ninetoothed-0.8.0.dist-info/METADATA,sha256=gPWYhTBH5EdeOyGnArZIEw82aFmoQchD6pxtLi6LGMA,7054
+ninetoothed-0.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ninetoothed-0.8.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ninetoothed-0.8.0.dist-info/RECORD,,
ninetoothed-0.6.0.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-ninetoothed/__init__.py,sha256=T5UJXlC-wbo8JKPbLUNT65Kccp12xP52WFV5FsugETI,147
-ninetoothed/jit.py,sha256=5gNp4HixCkural_Ns3DxwT4LL3OUcG0ECj4NLjb-EYk,16959
-ninetoothed/language.py,sha256=YwjlBENmmKPTnhaQ2uYbj5MwzrCAT7MLJ6VkQ6NeXJE,504
-ninetoothed/symbol.py,sha256=8Wg-JQPkVv9mMIxB1Rj4SHzOytHXPgHLkuK0BEFPDkc,5243
-ninetoothed/tensor.py,sha256=L-9LhwnM4uRtRvj3tqrzerUijEfKeTQvFBcmS1hQilI,6656
-ninetoothed/torchifier.py,sha256=8M2PDwyFIfVypX6Z-Vt_bGbsCPqxqKnftL0rXeh9bOM,911
-ninetoothed-0.6.0.dist-info/METADATA,sha256=zvY4nvKt7R8kWDYrGnApem_C07trLgOj1-7zXPfqD9U,6785
-ninetoothed-0.6.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-ninetoothed-0.6.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-ninetoothed-0.6.0.dist-info/RECORD,,
{ninetoothed-0.6.0.dist-info → ninetoothed-0.8.0.dist-info}/licenses/LICENSE
File without changes