ninetoothed 0.14.0__tar.gz → 0.15.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
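In short: 0.15.1 splits the old src/ninetoothed/jit.py into a reusable code generator (generation.py) and a thin JIT wrapper (jit.py), adds an ahead-of-time compilation path (aot.py, cudaifier.py, dtype.py) driven by the new make() entry point (make.py), re-exports the dtype names from the package root, and adds GitHub issue templates.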
- ninetoothed-0.15.1/.github/ISSUE_TEMPLATE/bug-report.yml +55 -0
- ninetoothed-0.15.1/.github/ISSUE_TEMPLATE/feature-request.yml +13 -0
- ninetoothed-0.15.1/.github/pull_request_template.md +5 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/PKG-INFO +1 -1
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/pyproject.toml +1 -1
- ninetoothed-0.15.1/src/ninetoothed/__init__.py +35 -0
- ninetoothed-0.15.1/src/ninetoothed/aot.py +217 -0
- ninetoothed-0.15.1/src/ninetoothed/cudaifier.py +36 -0
- ninetoothed-0.15.1/src/ninetoothed/dtype.py +13 -0
- ninetoothed-0.14.0/src/ninetoothed/jit.py → ninetoothed-0.15.1/src/ninetoothed/generation.py +83 -116
- ninetoothed-0.15.1/src/ninetoothed/jit.py +77 -0
- ninetoothed-0.15.1/src/ninetoothed/make.py +45 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/src/ninetoothed/tensor.py +1 -1
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/src/ninetoothed/visualization.py +10 -4
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_addmm.py +6 -7
- ninetoothed-0.15.1/tests/test_aot.py +153 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_conv2d.py +16 -2
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_matmul.py +13 -6
- ninetoothed-0.14.0/src/ninetoothed/__init__.py +0 -5
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/.gitattributes +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/.github/workflows/publish-to-pypi.yml +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/.github/workflows/pytest.yml +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/.github/workflows/ruff.yml +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/.github/workflows/sphinx.yml +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/.gitignore +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/LICENSE +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/README.md +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/Makefile +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/README.zh.md +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/make.bat +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/requirements.txt +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/_static/matmul-tiling.png +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/_static/ninetoothed-logo.png +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/_static/vecadd-tiling.png +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/code_generation.rst +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/conf.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/index.rst +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/installation.rst +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/python_api.rst +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/symbol.rst +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/tensor.rst +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/docs/source/visualization.rst +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/requirements.txt +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/src/ninetoothed/language.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/src/ninetoothed/naming.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/src/ninetoothed/symbol.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/src/ninetoothed/torchifier.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/__init__.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/skippers.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_add.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_attention.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_max_pool2d.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_naming.py +0 -0
- {ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_softmax.py +0 -0
ninetoothed-0.15.1/.github/ISSUE_TEMPLATE/bug-report.yml
ADDED
@@ -0,0 +1,55 @@
+name: 🐛 Bug report
+description: Something isn't working as expected 🤔.
+labels: ["bug"]
+
+body:
+  - type: markdown
+    attributes:
+      value: Thanks for taking the time to fill out this bug report!
+
+  - type: checkboxes
+    attributes:
+      label: Is there an existing issue for this?
+      description: >
+        Please search to see if an issue already exists
+        for the bug you encountered.
+      options:
+        - label: I have searched the existing issues.
+          required: true
+
+  - type: textarea
+    attributes:
+      label: "Describe the bug:"
+      description: A clear and concise description of what the bug is.
+    validations:
+      required: false
+
+  - type: textarea
+    attributes:
+      label: "To reproduce:"
+      description: >
+        Steps to reproduce the behavior.
+        If applicable, provide a small, self-contained piece of code
+        that can be run directly to reproduce the issue.
+    validations:
+      required: false
+
+  - type: textarea
+    attributes:
+      label: "Expected behavior:"
+      description: >
+        A clear and concise description of what you expected to happen.
+    validations:
+      required: false
+
+  - type: textarea
+    attributes:
+      label: "Environment details:"
+      description: >
+        Please include your NineToothed version, operating system,
+        hardware platform, and any relevant information.
+        If you are using PyTorch, please run
+        `python -m torch.utils.collect_env` to gather
+        environment information.
+    validations:
+      required: false
ninetoothed-0.15.1/.github/ISSUE_TEMPLATE/feature-request.yml
ADDED
@@ -0,0 +1,13 @@
+name: 🚀 Feature request
+description: I have a suggestion 🙂!
+
+body:
+  - type: textarea
+    attributes:
+      label: "Description & motivation:"
+      description: >
+        Please describe the feature that you would like to see and
+        explain the problem it would solve or
+        the benefit it would provide.
+    validations:
+      required: true
{ninetoothed-0.14.0 → ninetoothed-0.15.1}/PKG-INFO
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ninetoothed
-Version: 0.14.0
+Version: 0.15.1
 Summary: A domain-specific language based on Triton but providing higher-level abstraction.
 Project-URL: Homepage, https://github.com/InfiniTensor/ninetoothed
 Project-URL: Issues, https://github.com/InfiniTensor/ninetoothed/issues
{ninetoothed-0.14.0 → ninetoothed-0.15.1}/pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "ninetoothed"
-version = "0.14.0"
+version = "0.15.1"
 authors = [{ name = "Jiacheng Huang", email = "huangjiacheng0709@outlook.com" }]
 description = "A domain-specific language based on Triton but providing higher-level abstraction."
 readme = "README.md"
ninetoothed-0.15.1/src/ninetoothed/__init__.py
ADDED
@@ -0,0 +1,35 @@
+from ninetoothed.dtype import (
+    float16,
+    float32,
+    float64,
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+)
+from ninetoothed.jit import jit
+from ninetoothed.make import make
+from ninetoothed.symbol import Symbol
+from ninetoothed.tensor import Tensor
+
+__all__ = [
+    "Symbol",
+    "Tensor",
+    "float16",
+    "float32",
+    "float64",
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+    "jit",
+    "make",
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+]
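The package root now re-exports the dtype names alongside jit and make, so callers no longer need to reach into submodules. A minimal sketch of the new surface, mirroring how the tests below construct their tensors:

import ninetoothed
from ninetoothed import Tensor

# dtype names such as ninetoothed.float16 can be passed straight to Tensor,
# as tests/test_aot.py does for its matmul inputs.
tensors = tuple(Tensor(2, dtype=ninetoothed.float16) for _ in range(3))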
ninetoothed-0.15.1/src/ninetoothed/aot.py
ADDED
@@ -0,0 +1,217 @@
+import ast
+import pathlib
+import subprocess
+import tempfile
+import uuid
+
+from ninetoothed.dtype import int64
+from ninetoothed.generation import CACHE_DIR, CodeGenerator
+from ninetoothed.tensor import Tensor
+
+
+def aot(
+    func, caller="cuda", kernel_name=None, output_dir=None, num_warps=4, num_stages=3
+):
+    output_dir = pathlib.Path(output_dir)
+
+    output_contents = _aot(func, caller, kernel_name, num_warps, num_stages)
+
+    for output_name, output_content in output_contents.items():
+        output_path = output_dir / f"{kernel_name}{output_name[-2:]}"
+
+        with open(output_path, "w") as f:
+            f.write(output_content)
+
+
+def _aot(func, caller, kernel_name, num_warps, num_stages):
+    def _find_tensor_by_source_name(tensors, name):
+        for tensor in tensors:
+            if tensor.source.name == name:
+                return tensor
+
+    _HEADER_PATH.parent.mkdir(exist_ok=True)
+
+    if not _HEADER_PATH.exists():
+        _HEADER_PATH.write_text(_HEADER_CONTENT)
+
+    code_generator = CodeGenerator()
+    source_file = code_generator(
+        func, caller=caller, kernel_name=kernel_name, prettify=False
+    )
+
+    tensors = code_generator.tensors
+    kernel_func = code_generator.kernel_func
+    launch_func = code_generator.launch_func
+
+    param_types = []
+
+    for arg in kernel_func.args.args:
+        param = arg.arg
+
+        if match := Tensor.pointer_pattern().fullmatch(param):
+            source_name = match.group(0).removesuffix("_pointer")
+            tensor = _find_tensor_by_source_name(tensors, source_name)
+            dtype = tensor.source.dtype
+
+            param_types.append(f"*{dtype}")
+        elif Tensor.size_pattern().fullmatch(param):
+            param_types.append(int64)
+        elif Tensor.stride_pattern().fullmatch(param):
+            param_types.append(int64)
+
+    signature = ", ".join(param_types)
+
+    grid_extractor = _GridExtractor()
+    launch_func = grid_extractor.visit(launch_func)
+    grid_extractor.visit(code_generator.raw_grid)
+    grid = f"{ast.unparse(grid_extractor.grid[0])}, 1, 1"
+
+    signature_hash, output_contents = _compile(
+        source_file, kernel_name, signature, grid, num_warps, num_stages
+    )
+
+    unparser = _Unparser()
+
+    launch_func_unparsed = unparser.unparse(launch_func)
+    launch_func_unparsed = launch_func_unparsed.replace(
+        func.__name__, f"{kernel_name}_{signature_hash}"
+    )
+
+    c_source_file_name = f"{kernel_name}.{signature_hash}.c"
+    c_source_file = output_contents[c_source_file_name]
+    c_source_file = f"{c_source_file}\n{launch_func_unparsed}\n"
+    c_source_file = c_source_file.replace("<stdint.h>", f'"{_HEADER_PATH}"')
+    output_contents[c_source_file_name] = c_source_file
+
+    c_header_file_name = f"{kernel_name}.{signature_hash}.h"
+    c_header_file = output_contents[c_header_file_name]
+    c_header_file = f"{c_header_file}\n{unparser.header};\n"
+    c_header_file = c_header_file.replace("<stdint.h>", f'"{_HEADER_PATH}"')
+    output_contents[c_header_file_name] = c_header_file
+
+    return output_contents
+
+
+_HEADER_CONTENT = """#include <stdint.h>
+
+typedef struct {
+    uintptr_t data;
+    uint64_t *shape;
+    int64_t *strides;
+} NineToothedTensor;
+"""
+
+_HEADER_PATH = CACHE_DIR / "ninetoothed.h"
+
+
+class _Unparser:
+    def unparse(self, node):
+        method_name = "_unparse_" + node.__class__.__name__
+
+        if hasattr(self, method_name):
+            return getattr(self, method_name)(node)
+
+        return self._generic_unparse(node)
+
+    def _generic_unparse(self, node):
+        return ast.unparse(node)
+
+    def _unparse_Expr(self, node):
+        return self.unparse(node.value)
+
+    def _unparse_Call(self, node):
+        call = ast.Call(
+            func=node.func,
+            args=[ast.Name(id="stream", ctx=ast.Load())] + node.args,
+            keywords=[],
+        )
+
+        return f"return {self._generic_unparse(call)};"
+
+    def _unparse_FunctionDef(self, node):
+        params = ["CUstream stream"]
+        params += [f"NineToothedTensor {arg.arg}" for arg in node.args.args]
+        header = f"CUresult {node.name}({', '.join(params)})"
+
+        self.header = header
+
+        body_lines = []
+
+        for stmt in node.body:
+            stmt_unparsed = self.unparse(stmt)
+
+            if isinstance(stmt, ast.Expr):
+                stmt_unparsed = stmt_unparsed.strip()
+
+                if not stmt_unparsed.endswith(";"):
+                    stmt_unparsed += ";"
+
+            body_lines.append("    " + stmt_unparsed)
+
+        body = "\n".join(body_lines)
+
+        return f"{header} {{\n{body}\n}}"
+
+
+class _GridExtractor(ast.NodeTransformer):
+    def visit_BinOp(self, node):
+        self.generic_visit(node)
+
+        if isinstance(node.op, ast.FloorDiv):
+            node.op = ast.Div()
+
+        return node
+
+    def visit_Call(self, node):
+        self.generic_visit(node)
+
+        node.func = node.func.value
+
+        return node
+
+    def visit_Lambda(self, node):
+        self.generic_visit(node)
+
+        self.grid = node.body.elts
+
+        return node
+
+
+def _compile(path, name, signature, grid, num_warps, num_stages):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        output_dir = pathlib.Path(temp_dir)
+        output_name = uuid.uuid4().hex
+        output_path = output_dir / output_name
+
+        command = [
+            "python",
+            "-m",
+            "triton.tools.compile",
+            str(path),
+            "--kernel-name",
+            str(name),
+            "--signature",
+            str(signature),
+            "--grid",
+            str(grid),
+            "--num-warps",
+            str(num_warps),
+            "--num-stages",
+            str(num_stages),
+            "--out-path",
+            str(output_path),
+        ]
+
+        subprocess.run(command, check=True)
+
+        matching_files = list(output_dir.glob(f"{output_name}.*"))
+
+        signature_hash = matching_files[0].name.split(".")[1]
+
+        output_contents = {}
+
+        for file in matching_files:
+            with file.open() as f:
+                output_contents[file.name.replace(output_name, name)] = f.read()
+
+        return signature_hash, output_contents
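Read together with the header it emits, aot() compiles an annotated compute function through triton.tools.compile and writes a {kernel_name}.c/{kernel_name}.h pair into output_dir, with the generated C launcher taking a CUstream plus one NineToothedTensor per argument. A hedged sketch of driving it directly (the annotated function `add` is hypothetical; aot is not re-exported from the package root):

from ninetoothed.aot import aot

# Assumes `add` is a compute function whose annotations carry the tensor
# arrangement, as elsewhere in this diff, and that `build` exists.
aot(add, caller="cuda", kernel_name="add", output_dir="build")
# Per the code above, this should leave build/add.c and build/add.h, the
# latter declaring `CUresult launch_add(CUstream stream, NineToothedTensor ...)`.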
ninetoothed-0.15.1/src/ninetoothed/cudaifier.py
ADDED
@@ -0,0 +1,36 @@
+import ast
+
+import ninetoothed.naming as naming
+from ninetoothed.tensor import Tensor
+
+
+class Cudaifier(ast.NodeTransformer):
+    def visit_Name(self, node):
+        self.generic_visit(node)
+
+        source = node.id
+
+        if naming.is_constexpr(source):
+            return node
+
+        def repl(match):
+            return f"{match.group(1)}.data"
+
+        source = Tensor.pointer_pattern().sub(repl, source)
+
+        def repl(match):
+            return f"{match.group(1)}.shape[{match.group(3)}]"
+
+        source = Tensor.size_pattern().sub(repl, source)
+
+        def repl(match):
+            return f"{match.group(1)}.strides[{match.group(3)}]"
+
+        source = Tensor.stride_pattern().sub(repl, source)
+
+        source = source.removesuffix("_with_auto_tuning")
+
+        if source != node.id:
+            return ast.parse(source, mode="eval").body
+
+        return node
ninetoothed-0.14.0/src/ninetoothed/jit.py → ninetoothed-0.15.1/src/ninetoothed/generation.py
RENAMED
(Removed lines whose text is not captured in this view are marked "⋯ (not shown)".)
@@ -2,84 +2,89 @@ import ast
 import collections
 import copy
 import functools
-import importlib.util
+import hashlib
 import inspect
 import itertools
 import math
+import pathlib
 import subprocess
-import sys
-import tempfile
 
 import triton
 
 import ninetoothed.naming as naming
+from ninetoothed.cudaifier import Cudaifier
 from ninetoothed.language import attribute, call
 from ninetoothed.symbol import Symbol
 from ninetoothed.tensor import Tensor
 from ninetoothed.torchifier import Torchifier
 
+CACHE_DIR = pathlib.Path.home() / ".ninetoothed"
 
-def make(arrangement, application, tensors):
-    """Integrate the arrangement and the application of the tensors.
 
- ⋯ (not shown)
-    """
+class CodeGenerator(ast.NodeTransformer):
+    def __init__(self):
+        super().__init__()
+
+        self._POWER_OF_TWOS = tuple(2**n for n in range(5, 11))
+
+        self._MIN_PRODUCT = 2**10
+
+        self._MAX_PRODUCT = 2**20
+
+    def __call__(self, func, caller, kernel_name, prettify):
+        def _get_tree(func):
+            module = ast.parse(inspect.getsource(inspect.getmodule(func)))
+
+            collector = _ImportCollector()
+            collector.visit(module)
+
+            finder = _FunctionDefFinder(func.__name__)
+            finder.visit(module)
+            func_def = finder.result
+
+            inliner = _Inliner(func.__globals__)
+            inliner.visit(func_def)
+            module.body = collector.imports + inliner.imports + [finder.result]
+
+            return _AliasRestorer().visit(module)
+
+        def _find_dependencies(func):
+            dependencies = set()
+
+            for obj in func.__globals__.values():
+                if isinstance(obj, triton.runtime.JITFunction):
+                    dependencies.add(obj.src)
+
+            return "\n".join(
+                f"@triton.jit\n{dependency}" for dependency in dependencies
+            )
+
+        self.launch_func_name = f"launch_{kernel_name}"
+
+        self._caller = caller
 
-    def __init__(self, func, _prettify=False):
-        self.func = func
+        self._context = inspect.get_annotations(func)
 
-        self.⋯
+        self._args = list(self._context.values())
 
- ⋯ (not shown)
-        tree = self._get_tree()
+        tree = _get_tree(func)
 
- ⋯ (not shown)
+        self.visit(tree)
         Tritonizer().visit(tree)
         _BinOpSimplifier().visit(tree)
         ast.fix_missing_locations(tree)
 
-        if self._prettify:
+        if prettify:
             name_collector = _SimplifiedNameCollector()
             name_collector.visit(tree)
 
         unparsed = ast.unparse(tree).replace("None:", ":").replace(":None", ":")
-        dependencies = self._find_dependencies()
+        dependencies = _find_dependencies(func)
         source = "\n\n".join((unparsed, dependencies)).strip()
+        source = source.replace(func.__name__, kernel_name)
+        source += "\n"
 
-        if self._prettify:
+        if prettify:
             for original, simplified in name_collector.simplified_names.items():
                 if simplified not in name_collector.simplified_names:
                     source = source.replace(original, simplified)
@@ -88,73 +93,29 @@ class JIT:
                 ["ruff", "format", "-"], input=source, encoding="utf-8"
             )
 
- ⋯ (not shown)
+        digest = hashlib.sha256(source.encode("utf-8")).hexdigest()
+        cache_dir = CACHE_DIR
+        cache_dir.mkdir(exist_ok=True)
+        cache_file = cache_dir / f"{digest}.py"
 
- ⋯ (not shown)
+        if not cache_file.exists():
+            with open(cache_file, "w", encoding="utf-8") as f:
+                f.write(source)
 
- ⋯ (not shown)
-            source,
-        )
-
-        return handle
-
-    def _get_tree(self):
-        module = ast.parse(inspect.getsource(inspect.getmodule(self.func)))
-
-        collector = _ImportCollector()
-        collector.visit(module)
-
-        finder = _FunctionDefFinder(self.func.__name__)
-        finder.visit(module)
-        func_def = finder.result
+        self.tensors = self._args
+        self.kernel_func = self._func_def
+        self.launch_func = self._launch
 
- ⋯ (not shown)
-        inliner.visit(func_def)
-        module.body = collector.imports + inliner.imports + [finder.result]
-
-        return _AliasRestorer().visit(module)
-
-    def _find_dependencies(self):
-        dependencies = set()
-
-        for obj in self.func.__globals__.values():
-            if isinstance(obj, triton.runtime.JITFunction):
-                dependencies.add(obj.src)
-
-        return "\n".join(f"@triton.jit\n{dependency}" for dependency in dependencies)
-
-    @staticmethod
-    def _import_from_path(module_name, file_path):
-        spec = importlib.util.spec_from_file_location(module_name, file_path)
-        module = importlib.util.module_from_spec(spec)
-        sys.modules[module_name] = module
-        spec.loader.exec_module(module)
-
-        return module
-
-
-class CodeGenerator(ast.NodeTransformer):
-    def __init__(self, context):
-        super().__init__()
-
-        self._context = context
-
-        self._args = list(self._context.values())
-
-        self._POWER_OF_TWOS = tuple(2**n for n in range(5, 11))
-
-        self._MIN_PRODUCT = 2**10
-
-        self._MAX_PRODUCT = 2**20
+        return str(cache_file)
 
     def visit_Module(self, node):
         self.generic_visit(node)
 
+        func_with_auto_tuning = f"{Symbol(self._autotune)}({self._func_def.name})"
+
+        node.body.append(
+            ast.parse(f"{self._func_name_with_auto_tuning} = {func_with_auto_tuning}")
+        )
         node.body.append(self._launch)
 
         return node
@@ -162,6 +123,8 @@ class CodeGenerator(ast.NodeTransformer):
     def visit_FunctionDef(self, node):
         self._func_def = node
 
+        self._func_name_with_auto_tuning = f"{self._func_def.name}_with_auto_tuning"
+
         self._invariants = {}
 
         self.generic_visit(node)
@@ -184,6 +147,9 @@ class CodeGenerator(ast.NodeTransformer):
             if naming.is_constexpr(name)
         }
 
+        non_meta_names = sorted(non_meta_names)
+        meta_names = sorted(meta_names)
+
         node.args = [
             ast.arg(arg=name)
             if not naming.is_constexpr(name)
@@ -194,8 +160,8 @@ class CodeGenerator(ast.NodeTransformer):
             for name in meta_names
         ]
 
- ⋯ (not shown)
-        self._func_def.decorator_list = [⋯
+        self._autotune = self._generate_autotune(non_meta_names, meta_names)
+        self._func_def.decorator_list = [Symbol("triton.jit").node]
 
         self._launch = self._generate_launch(non_meta_names, meta_names)
 
@@ -354,7 +320,7 @@ class CodeGenerator(ast.NodeTransformer):
         ]
 
         launch = ast.FunctionDef(
-            name=⋯
+            name=self.launch_func_name,
             args=ast.arguments(
                 posonlyargs=[],
                 args=[ast.arg(arg=arg.source.name) for arg in self._args]
@@ -392,7 +358,9 @@ class CodeGenerator(ast.NodeTransformer):
                 ast.Expr(
                     ast.Call(
                         func=ast.Subscript(
-                            value=ast.Name(⋯
+                            value=ast.Name(
+                                id=self._func_name_with_auto_tuning, ctx=ast.Load()
+                            ),
                             slice=self._generate_grid(),
                             ctx=ast.Load(),
                         ),
@@ -422,14 +390,23 @@ class CodeGenerator(ast.NodeTransformer):
 
         MetaEncloser(meta).visit(launch)
 
- ⋯ (not shown)
+        if self._caller == "torch":
+            Torchifier().visit(launch)
+        elif self._caller == "cuda":
+            Cudaifier().visit(launch)
+        else:
+            raise ValueError(f"Unsupported caller: `{self._caller}`.")
 
         return launch
 
     def _generate_grid(self):
        num_elements = functools.reduce(lambda x, y: x * y, self._args[0].shape)
 
- ⋯ (not shown)
+        grid = ast.parse(f"lambda meta: ({num_elements},)", mode="eval").body
+
+        self.raw_grid = copy.deepcopy(grid)
+
+        return grid
 
     def _generate_load(self, tensor, indices=()):
         if tensor.ndim == 0:
@@ -851,7 +828,7 @@ class _Inliner(ast.NodeTransformer):
         return names
 
     def _make_temporary():
-        prefix = naming.auto_generate(f⋯
+        prefix = f"{naming.auto_generate(f'temporary_{self._count}')}_"
         self._count += 1
 
         return prefix
@@ -941,16 +918,6 @@ class _SimplifiedNameCollector(ast.NodeVisitor):
         self.simplified_names[node.id] = naming.remove_prefixes(node.id)
 
 
-class _Handle:
-    def __init__(self, kernel, launch, source):
-        self._kernel = kernel
-        self._launch = launch
-        self._source = source
-
-    def __call__(self, *args, **kwargs):
-        return self._launch(*args, **kwargs)
-
-
 class _AliasRestorer(ast.NodeTransformer):
     def __init__(self):
         super().__init__()
ninetoothed-0.15.1/src/ninetoothed/jit.py
ADDED
@@ -0,0 +1,77 @@
+import importlib
+import sys
+
+from ninetoothed.generation import CodeGenerator
+
+
+def jit(func=None, *, caller="torch", kernel_name=None, _prettify=False):
+    """A decorator for generating compute kernels.
+
+    :param func: The function to be compiled.
+    :param caller: Who will call the compute kernel.
+    :param kernel_name: The name for the generated kernel.
+    :param _prettify: Whether to prettify the generated code.
+    :return: A handle to the compute kernel.
+
+    .. note::
+
+        The ``_prettify`` parameter is experimental, which might break
+        the generated code.
+    """
+
+    def wrapper(func):
+        return JIT(func, caller=caller, kernel_name=kernel_name, _prettify=_prettify)()
+
+    if func is None:
+        return wrapper
+
+    return wrapper(func)
+
+
+class JIT:
+    def __init__(self, func, caller, kernel_name, _prettify=False):
+        self.func = func
+
+        self._caller = caller
+
+        if kernel_name is not None:
+            self._kernel_name = kernel_name
+        else:
+            self._kernel_name = func.__name__
+
+        self._prettify = _prettify
+
+    def __call__(self):
+        code_generator = CodeGenerator()
+        source_file = code_generator(
+            self.func, self._caller, self._kernel_name, self._prettify
+        )
+        module = type(self)._import_from_path(source_file, source_file)
+        module_vars = vars(module)
+
+        handle = _Handle(
+            module_vars[self._kernel_name],
+            module_vars[code_generator.launch_func_name],
+            source_file,
+        )
+
+        return handle
+
+    @staticmethod
+    def _import_from_path(module_name, file_path):
+        spec = importlib.util.spec_from_file_location(module_name, file_path)
+        module = importlib.util.module_from_spec(spec)
+        sys.modules[module_name] = module
+        spec.loader.exec_module(module)
+
+        return module
+
+
+class _Handle:
+    def __init__(self, kernel, launch, source):
+        self._kernel = kernel
+        self._launch = launch
+        self._source = source
+
+    def __call__(self, *args, **kwargs):
+        return self._launch(*args, **kwargs)
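For reference, the decorator keeps its annotation-driven usage and only gains the caller and kernel_name keywords. A minimal sketch in the style of the project's README (the vector-add kernel is illustrative, not part of this diff):

import ninetoothed
from ninetoothed import Symbol, Tensor

BLOCK_SIZE = Symbol("BLOCK_SIZE", meta=True)

@ninetoothed.jit(caller="torch", kernel_name="add")
def add_kernel(
    x: Tensor(1).tile((BLOCK_SIZE,)),
    y: Tensor(1).tile((BLOCK_SIZE,)),
    z: Tensor(1).tile((BLOCK_SIZE,)),
):
    # Each program instance adds one tile of x and y into z.
    z = x + y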
ninetoothed-0.15.1/src/ninetoothed/make.py
ADDED
@@ -0,0 +1,45 @@
+import inspect
+
+from ninetoothed.aot import aot
+from ninetoothed.jit import jit
+
+
+def make(
+    arrangement,
+    application,
+    tensors,
+    caller="torch",
+    kernel_name=None,
+    output_dir=None,
+    num_warps=4,
+    num_stages=3,
+):
+    """Integrate the arrangement and the application of the tensors.
+
+    :param arrangement: The arrangement of the tensors.
+    :param application: The application of the tensors.
+    :param tensors: The tensors.
+    :param caller: Who will call the compute kernel.
+    :param kernel_name: The name for the generated kernel.
+    :param output_dir: The directory to store the generated files.
+    :param num_warps: The number of warps to use.
+    :param num_stages: The number of pipeline stages.
+    :return: A handle to the compute kernel.
+    """
+
+    params = inspect.signature(application).parameters
+    types = arrangement(*tensors)
+    annotations = {param: type for param, type in zip(params, types)}
+    application.__annotations__ = annotations
+
+    if caller == "torch":
+        return jit(application, caller=caller, kernel_name=kernel_name)
+
+    return aot(
+        application,
+        caller=caller,
+        kernel_name=kernel_name,
+        output_dir=output_dir,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
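make() is now the single entry point covering both back ends: with caller="torch" it degenerates to jit(), while any other caller (currently "cuda" per generation.py above) goes through aot(). A sketch using the matmul pieces from the tests in this diff (assumes the repository's tests package is importable):

import functools

import ninetoothed
from ninetoothed import Tensor
import tests.test_matmul as matmul

arrangement = functools.partial(
    matmul.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
)
tensors = tuple(Tensor(2, dtype=ninetoothed.float16) for _ in range(3))

# JIT path: returns a callable kernel handle.
matmul_kernel = ninetoothed.make(arrangement, matmul.application, tensors)

# AOT path: writes matmul.c and matmul.h into the given directory instead,
# as exercised by tests/test_aot.py (the directory must already exist).
ninetoothed.make(
    arrangement,
    matmul.application,
    tensors,
    caller="cuda",
    kernel_name="matmul",
    output_dir="build",
)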
{ninetoothed-0.14.0 → ninetoothed-0.15.1}/src/ninetoothed/visualization.py
CHANGED
@@ -118,10 +118,16 @@ def _visualize_unit_square(ax, x, y, color):
 
 
 def _visualize_rect(ax, width, height, x, y, color):
- ⋯ (not shown)
+    ax.add_patch(
+        plt.Rectangle(
+            (x, y),
+            width,
+            height,
+            edgecolor="k",
+            facecolor=color,
+            linewidth=plt.rcParams["lines.linewidth"],
+        )
+    )
 
 
 def _verts_of_rect(width, height, x, y):
{ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_addmm.py
CHANGED
@@ -3,6 +3,7 @@ import random
 import torch
 
 import ninetoothed
+import ninetoothed.language as ntl
 import tests.test_matmul as matmul
 from ninetoothed import Tensor
 from tests.skippers import skip_if_cuda_not_available, skip_if_float8_e5m2_not_supported
@@ -19,8 +20,9 @@ def arrangement(input, mat1, mat2, beta, alpha, output):
 
 
 def application(input, mat1, mat2, beta, alpha, output):
- ⋯ (not shown)
+    matmul_output = ntl.zeros(output.shape, dtype=ntl.float32)
+    matmul.application(mat1, mat2, matmul_output)
+    output = beta * input + alpha * matmul_output
 
 
 def addmm(input, mat1, mat2, beta=1, alpha=1):
@@ -43,6 +45,7 @@ def addmm(input, mat1, mat2, beta=1, alpha=1):
 class TestCUDA:
     @classmethod
     def setup_class(cls):
+        random.seed(0)
         torch.manual_seed(0)
 
         shape = (512, 512)
@@ -74,9 +77,6 @@ class TestCUDA:
         beta = type(self).beta
         alpha = type(self).alpha
 
-        # TODO: The current application function inlining feature
-        # causes some precision issues. Consider reducing `atol` and
-        # `rtol` of this test in the future.
         assert torch.allclose(
             addmm(input, mat1, mat2, beta=beta, alpha=alpha),
             torch.addmm(
@@ -86,6 +86,5 @@ class TestCUDA:
                 beta=beta,
                 alpha=alpha,
             ),
-            atol=0.⋯
-            rtol=0.5,
+            atol=0.125,
         )
ninetoothed-0.15.1/tests/test_aot.py
ADDED
@@ -0,0 +1,153 @@
+import ctypes
+import functools
+import subprocess
+
+import torch
+import torch.nn.functional as F
+
+import ninetoothed
+import ninetoothed.generation
+import tests.test_conv2d as conv2d
+import tests.test_matmul as matmul
+from ninetoothed import Tensor
+from tests.skippers import skip_if_cuda_not_available
+
+
+@skip_if_cuda_not_available
+class TestCUDA:
+    @classmethod
+    def setup_class(cls):
+        torch.manual_seed(0)
+
+    def test_matmul(self):
+        arrangement = functools.partial(
+            matmul.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
+        )
+        application = matmul.application
+        tensors = tuple(Tensor(2, dtype=ninetoothed.float16) for _ in range(3))
+        caller = "cuda"
+        kernel_name = "matmul"
+        output_dir = ninetoothed.generation.CACHE_DIR
+
+        launch_func = _generate_launch_func(
+            arrangement,
+            application,
+            tensors,
+            caller=caller,
+            kernel_name=kernel_name,
+            output_dir=output_dir,
+        )
+
+        shape = (512, 512)
+        dtype = torch.float16
+        device = caller
+
+        lhs = torch.randn(shape, dtype=dtype, device=device)
+        rhs = torch.randn(shape, dtype=dtype, device=device)
+        output = torch.empty((lhs.shape[0], rhs.shape[1]), dtype=dtype, device=device)
+
+        _run_launch_func(launch_func, lhs, rhs, output)
+
+        assert torch.allclose(output, torch.matmul(lhs, rhs))
+
+    def test_conv2d(self):
+        arrangement = functools.partial(
+            conv2d.arrangement, BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=64
+        )
+        application = matmul.application
+        tensors = tuple(Tensor(4, dtype=ninetoothed.float16) for _ in range(3))
+        caller = "cuda"
+        kernel_name = "conv2d"
+        output_dir = ninetoothed.generation.CACHE_DIR
+
+        launch_func = _generate_launch_func(
+            arrangement,
+            application,
+            tensors,
+            caller=caller,
+            kernel_name=kernel_name,
+            output_dir=output_dir,
+        )
+
+        n, c, h, w = 4, 64, 16, 16
+        k, _, r, s = 512, c, 3, 3
+        p = h - r + 1
+        q = w - s + 1
+        dtype = torch.float16
+        device = caller
+
+        input = torch.randn(n, c, h, w, dtype=dtype, device=device)
+        filter = torch.randn(k, c, r, s, dtype=dtype, device=device)
+        output = torch.empty(n, k, p, q, dtype=dtype, device=device)
+
+        _run_launch_func(launch_func, input, filter, output)
+
+        assert torch.allclose(output, F.conv2d(input, filter), atol=0.001, rtol=0.001)
+
+
+class _ArgumentTensor(ctypes.Structure):
+    _fields_ = [
+        ("data", ctypes.c_void_p),
+        ("shape", ctypes.POINTER(ctypes.c_uint64)),
+        ("strides", ctypes.POINTER(ctypes.c_int64)),
+    ]
+
+    @staticmethod
+    def from_torch_tensor(tensor):
+        data = ctypes.c_void_p(tensor.data_ptr())
+        shape = (ctypes.c_uint64 * len(tensor.shape))(*tensor.shape)
+        strides = (ctypes.c_int64 * len(tensor.stride()))(*tensor.stride())
+
+        return _ArgumentTensor(data, shape, strides)
+
+
+def _run_launch_func(launch_func, *tensors):
+    stream = torch.cuda.Stream()
+
+    arg_tensors = tuple(_ArgumentTensor.from_torch_tensor(tensor) for tensor in tensors)
+
+    with torch.cuda.stream(stream):
+        launch_func(ctypes.c_void_p(stream.cuda_stream), *arg_tensors)
+
+    stream.synchronize()
+
+
+def _generate_launch_func(
+    arrangement, application, tensors, caller, kernel_name, output_dir
+):
+    ninetoothed.make(
+        arrangement,
+        application,
+        tensors,
+        caller=caller,
+        kernel_name=kernel_name,
+        output_dir=output_dir,
+    )
+
+    _compile_library(kernel_name, output_dir)
+    library = _load_library(kernel_name, output_dir)
+    launch_func_name = f"launch_{kernel_name}"
+    launch_func = getattr(library, launch_func_name)
+    launch_func.argtypes = (ctypes.c_void_p,) + tuple(_ArgumentTensor for _ in tensors)
+    launch_func.restype = ctypes.c_int
+
+    return launch_func
+
+
+def _compile_library(kernel_name, output_dir):
+    command = [
+        "nvcc",
+        "-shared",
+        "-Xcompiler",
+        "-fPIC",
+        "-lcuda",
+        "-o",
+        output_dir / f"{kernel_name}.so",
+        output_dir / f"{kernel_name}.c",
+    ]
+
+    subprocess.run(command, check=True)
+
+
+def _load_library(kernel_name, kernel_dir):
+    return ctypes.CDLL(kernel_dir / f"{kernel_name}.so")
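Note that _ArgumentTensor's field order (data, shape, strides) mirrors the NineToothedTensor struct that aot.py writes into ninetoothed.h, which is what lets these tests pass PyTorch tensors through ctypes into the generated launch_<kernel> entry points.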
{ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_conv2d.py
CHANGED
@@ -1,3 +1,5 @@
+import functools
+
 import torch
 import torch.nn.functional as F
 
@@ -7,7 +9,14 @@ from ninetoothed import Tensor
 from tests.skippers import skip_if_cuda_not_available
 
 
-def arrangement(input, filter, output):
+def arrangement(
+    input,
+    filter,
+    output,
+    BLOCK_SIZE_M=matmul.BLOCK_SIZE_M,
+    BLOCK_SIZE_N=matmul.BLOCK_SIZE_N,
+    BLOCK_SIZE_K=matmul.BLOCK_SIZE_K,
+):
     input_tiled = input.tile((1, *filter.shape[1:]), strides=(-1, -1, 1, 1))
     input_squeezed = input_tiled.squeeze(1)
     input_squeezed.dtype = input_squeezed.dtype.squeeze(0)
@@ -19,7 +28,12 @@ def arrangement(input, filter, output):
 
     output_flattened = output.permute((0, 2, 3, 1)).flatten(end_dim=3)
 
-    return matmul.arrangement(input_flattened, filter_permuted, output_flattened)
+    return functools.partial(
+        matmul.arrangement,
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+    )(input_flattened, filter_permuted, output_flattened)
 
 
 def conv2d(input, filter):
{ninetoothed-0.14.0 → ninetoothed-0.15.1}/tests/test_matmul.py
CHANGED
@@ -5,12 +5,19 @@ import ninetoothed.language as ntl
 from ninetoothed import Symbol, Tensor
 from tests.skippers import skip_if_cuda_not_available, skip_if_float8_e5m2_not_supported
 
- ⋯ (not shown)
+BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", meta=True)
+BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", meta=True)
+BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", meta=True)
+
+
+def arrangement(
+    lhs,
+    rhs,
+    output,
+    BLOCK_SIZE_M=BLOCK_SIZE_M,
+    BLOCK_SIZE_N=BLOCK_SIZE_N,
+    BLOCK_SIZE_K=BLOCK_SIZE_K,
+):
     output_tiled = output.tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
 
     lhs_tiled = (