ninetoothed 0.15.1__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ninetoothed/__init__.py CHANGED
@@ -13,12 +13,13 @@ from ninetoothed.dtype import (
 )
 from ninetoothed.jit import jit
 from ninetoothed.make import make
-from ninetoothed.symbol import Symbol
+from ninetoothed.symbol import Symbol, block_size
 from ninetoothed.tensor import Tensor

 __all__ = [
     "Symbol",
     "Tensor",
+    "block_size",
     "float16",
     "float32",
     "float64",
ninetoothed/aot.py CHANGED
@@ -31,12 +31,18 @@ def _aot(func, caller, kernel_name, num_warps, num_stages):

     _HEADER_PATH.parent.mkdir(exist_ok=True)

-    if not _HEADER_PATH.exists():
+    if not _HEADER_PATH.exists() or _HEADER_PATH.read_text() != _HEADER_CONTENT:
         _HEADER_PATH.write_text(_HEADER_CONTENT)

     code_generator = CodeGenerator()
     source_file = code_generator(
-        func, caller=caller, kernel_name=kernel_name, prettify=False
+        func,
+        caller=caller,
+        kernel_name=kernel_name,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        max_num_configs=None,
+        prettify=False,
     )

     tensors = code_generator.tensors
@@ -85,20 +91,29 @@ def _aot(func, caller, kernel_name, num_warps, num_stages):

     c_header_file_name = f"{kernel_name}.{signature_hash}.h"
     c_header_file = output_contents[c_header_file_name]
-    c_header_file = f"{c_header_file}\n{unparser.header};\n"
+    c_header_file = f'{c_header_file}\n#ifdef __cplusplus\nextern "C" {unparser.header};\n#else\n{unparser.header};\n#endif\n'
     c_header_file = c_header_file.replace("<stdint.h>", f'"{_HEADER_PATH}"')
     output_contents[c_header_file_name] = c_header_file

     return output_contents


-_HEADER_CONTENT = """#include <stdint.h>
+_HEADER_CONTENT = """#ifndef NINETOOTHED_H
+#define NINETOOTHED_H
+
+#include <stdint.h>

 typedef struct {
-    uintptr_t data;
+    void *data;
     uint64_t *shape;
     int64_t *strides;
 } NineToothedTensor;
+
+typedef void *NineToothedStream;
+
+typedef int NineToothedResult;
+
+#endif // NINETOOTHED_H
 """

 _HEADER_PATH = CACHE_DIR / "ninetoothed.h"
@@ -129,9 +144,9 @@ class _Unparser:
         return f"return {self._generic_unparse(call)};"

     def _unparse_FunctionDef(self, node):
-        params = ["CUstream stream"]
+        params = ["NineToothedStream stream"]
         params += [f"NineToothedTensor {arg.arg}" for arg in node.args.args]
-        header = f"CUresult {node.name}({', '.join(params)})"
+        header = f"NineToothedResult {node.name}({', '.join(params)})"

         self.header = header

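Note: the generated header is now guarded against re-inclusion, wraps the kernel declaration in extern "C" for C++ consumers, and swaps the CUDA-specific CUstream/CUresult for the opaque NineToothedStream and NineToothedResult aliases, decoupling the declared ABI from any one backend. A hypothetical ctypes sketch of binding a compiled kernel against that ABI; the library path, kernel name, and tensor count are assumptions:

    import ctypes

    class NineToothedTensor(ctypes.Structure):
        # Mirrors the struct in ninetoothed.h; `data` is now a void *
        # instead of a uintptr_t.
        _fields_ = [
            ("data", ctypes.c_void_p),
            ("shape", ctypes.POINTER(ctypes.c_uint64)),
            ("strides", ctypes.POINTER(ctypes.c_int64)),
        ]

    def bind_kernel(library_path, kernel_name, num_tensors):
        lib = ctypes.CDLL(library_path)
        kernel = getattr(lib, kernel_name)
        kernel.restype = ctypes.c_int  # NineToothedResult
        # The first parameter is a NineToothedStream, i.e. an opaque void *.
        kernel.argtypes = [ctypes.c_void_p] + [NineToothedTensor] * num_tensors
        return kernel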
ninetoothed/generation.py CHANGED
@@ -5,11 +5,21 @@ import functools
 import hashlib
 import inspect
 import itertools
+import json
 import math
+import os
 import pathlib
+import random
+import shutil
 import subprocess
+import tempfile
+import time
+import uuid

+import sympy
 import triton
+import triton.language as tl
+from triton.language.extra import libdevice

 import ninetoothed.naming as naming
 from ninetoothed.cudaifier import Cudaifier
@@ -19,19 +29,47 @@ from ninetoothed.tensor import Tensor
 from ninetoothed.torchifier import Torchifier

 CACHE_DIR = pathlib.Path.home() / ".ninetoothed"
+CACHE_DIR.mkdir(exist_ok=True)


 class CodeGenerator(ast.NodeTransformer):
     def __init__(self):
         super().__init__()

-        self._POWER_OF_TWOS = tuple(2**n for n in range(5, 11))
+        cache_file = CACHE_DIR / "code_generator_cache.json"

-        self._MIN_PRODUCT = 2**10
+        log2_min_num_elements = 4

-        self._MAX_PRODUCT = 2**20
+        if cache_file.exists():
+            with open(cache_file) as f:
+                cache = json.load(f)

-    def __call__(self, func, caller, kernel_name, prettify):
+            log2_max_num_elements = cache["log2_max_num_elements"]
+        else:
+            log2_max_num_elements = _determine_log2_max_num_elements_per_block(
+                log2_min_num_elements
+            )
+
+            cache = {"log2_max_num_elements": log2_max_num_elements}
+
+            with open(cache_file, "w") as f:
+                json.dump(cache, f, indent=4)
+                f.write("\n")
+
+        self._min_num_elements = 2**log2_min_num_elements
+
+        self._max_num_elements = 2**log2_max_num_elements
+
+    def __call__(
+        self,
+        func,
+        caller,
+        kernel_name,
+        num_warps,
+        num_stages,
+        max_num_configs,
+        prettify,
+    ):
         def _get_tree(func):
             module = ast.parse(inspect.getsource(inspect.getmodule(func)))

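Note: CodeGenerator now probes the device once for the largest practical block size (via _determine_log2_max_num_elements_per_block, added at the end of this file) and memoizes the result, so subsequent runs skip the probe. A sketch for inspecting the memoized value; the exponent is machine-specific:

    import json
    import pathlib

    cache_file = pathlib.Path.home() / ".ninetoothed" / "code_generator_cache.json"
    # Written after the first generation, e.g. {"log2_max_num_elements": 20}.
    print(json.loads(cache_file.read_text()))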
@@ -63,6 +101,12 @@ class CodeGenerator(ast.NodeTransformer):

         self._caller = caller

+        self._num_wraps = num_warps
+
+        self._num_stages = num_stages
+
+        self._max_num_configs = max_num_configs
+
         self._context = inspect.get_annotations(func)

         self._args = list(self._context.values())
@@ -94,9 +138,7 @@ class CodeGenerator(ast.NodeTransformer):
         )

         digest = hashlib.sha256(source.encode("utf-8")).hexdigest()
-        cache_dir = CACHE_DIR
-        cache_dir.mkdir(exist_ok=True)
-        cache_file = cache_dir / f"{digest}.py"
+        cache_file = CACHE_DIR / f"{digest}.py"

         if not cache_file.exists():
             with open(cache_file, "w", encoding="utf-8") as f:
@@ -111,11 +153,15 @@
     def visit_Module(self, node):
         self.generic_visit(node)

-        func_with_auto_tuning = f"{Symbol(self._autotune)}({self._func_def.name})"
+        if self._autotune is not None:
+            func_with_auto_tuning = f"{Symbol(self._autotune)}({self._func_def.name})"
+
+            node.body.append(
+                ast.parse(
+                    f"{self._func_name_with_auto_tuning} = {func_with_auto_tuning}"
+                )
+            )

-        node.body.append(
-            ast.parse(f"{self._func_name_with_auto_tuning} = {func_with_auto_tuning}")
-        )
         node.body.append(self._launch)

         return node
@@ -137,8 +183,13 @@
     def visit_arguments(self, node):
         self.generic_visit(node)

-        names_of_args = [arg.names() - {"ninetoothed"} for arg in self._args]
-        names = functools.reduce(lambda x, y: x | y, names_of_args)
+        symbols = {
+            name.node.id: name
+            for arg in self._args
+            for name in arg.names()
+            if name != "ninetoothed"
+        }
+        names = symbols.keys()
         meta_names = {name for name in names if naming.is_meta(name)}
         non_meta_names = {name for name in names if name not in meta_names}
         non_meta_names |= {
@@ -147,6 +198,8 @@
             if naming.is_constexpr(name)
         }

+        self._symbols = symbols
+
         non_meta_names = sorted(non_meta_names)
         meta_names = sorted(meta_names)

@@ -161,12 +214,53 @@
         ]

         self._autotune = self._generate_autotune(non_meta_names, meta_names)
+
+        if self._autotune is not None:
+            self._func_name = self._func_name_with_auto_tuning
+        else:
+            self._func_name = self._func_def.name
+
         self._func_def.decorator_list = [Symbol("triton.jit").node]

         self._launch = self._generate_launch(non_meta_names, meta_names)

         return node

+    def visit_Call(self, node):
+        def _offsets(tensor, dim=None):
+            if dim is None:
+                return tensor._last_generated_overall_offsets.node
+
+            offsets = tensor._last_generated_offsets
+
+            if dim < 0:
+                dim += tensor.source.ndim
+
+            return sum(
+                offsets[dim][target_dim] for target_dim in range(tensor.target.ndim)
+            ).node
+
+        func = node.func
+        args = node.args
+
+        if isinstance(func, ast.Attribute):
+            if func.attr == "offsets":
+                value = func.value
+
+                if self._in_context(value):
+                    tensor = self._context[value.id]
+                elif isinstance(value, ast.Subscript) and self._in_context(value.value):
+                    tensor = self._context[value.value.id]
+
+                self.visit(value)
+
+                # TODO: Add error handling.
+                return _offsets(tensor, ast.literal_eval(args[0]) if args else None)
+
+        self.generic_visit(node)
+
+        return node
+
     def visit_Subscript(self, node):
         if self._in_context(node.value) and isinstance(node.ctx, ast.Load):
             value = self._context[node.value.id]
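Note: the new visit_Call hook rewrites tensor.offsets() calls inside an application function into the offset expressions recorded during pointer generation (see the _last_generated_offsets bookkeeping further down this diff): no argument yields the overall offsets, while an integer selects one source dimension; the argument must be a literal, since it is read with ast.literal_eval. A hypothetical sketch of user code exercising it; the arrangement and semantics of the stored value are assumptions:

    import ninetoothed
    from ninetoothed import Tensor, block_size

    BLOCK_SIZE = block_size()

    @ninetoothed.jit
    def offsets_kernel(x: Tensor(1).tile((BLOCK_SIZE,))):
        # Rewritten at compile time into the generated offsets of x
        # along source dimension 0.
        x = x.offsets(0)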
@@ -184,13 +278,24 @@
         return node

     def visit_Attribute(self, node):
-        if self._in_context(node.value):
-            value = self._context[node.value.id]
+        value = node.value

-            if isinstance(value, Tensor):
-                inner = value.dtype
+        if isinstance(value, ast.Attribute):
+            value = self.visit_Attribute(value)
+
+        if self._in_context(value):
+            value = self._context[value.id].dtype
+
+        if isinstance(value, Tensor):
+            attr = getattr(value, node.attr)

-                return Symbol(getattr(inner, node.attr)).node
+            if node.attr == "dtype" and attr is None:
+                return Symbol(f"{value.source.pointer_string()}.type.element_ty").node
+
+            if isinstance(attr, Tensor):
+                return attr
+
+            return Symbol(attr).node

         self.generic_visit(node)

@@ -244,12 +349,69 @@
         return isinstance(node, ast.Name) and node.id in self._context

     def _generate_autotune(self, params, meta):
-        device = triton.runtime.driver.active.get_current_device()
-        properties = triton.runtime.driver.active.utils.get_device_properties(device)
-        max_shared_mem = properties["max_shared_mem"]
+        inequalities = True
+
+        for arg in self._args:
+            if arg.ndim == 0:
+                continue
+
+            num_elements = sympy.simplify(str(math.prod(arg.innermost().shape)))
+
+            inequalities &= num_elements <= self._max_num_elements
+            inequalities &= num_elements >= self._min_num_elements
+
+        values_of_meta_params = []
+
+        for param in meta:
+            symbol = self._symbols[param]
+
+            values = range(symbol.lower_bound, symbol.upper_bound + 1)
+
+            if symbol.power_of_two:
+                values = tuple(value for value in values if value & (value - 1) == 0)
+            else:
+                values = tuple(values)
+
+            values_of_meta_params.append(values)
+
+        max_values_of_non_meta_params = {}
+
+        for free_symbol in inequalities.free_symbols:
+            symbol_str = str(free_symbol)
+
+            if symbol_str in meta:
+                continue
+
+            symbol = self._symbols[symbol_str]
+
+            max_values_of_non_meta_params[symbol_str] = symbol.upper_bound

-        num_warps = 8
-        num_stages = max_shared_mem // 2**15
+        block_size_configs = []
+
+        for values in itertools.product(*values_of_meta_params):
+            config = {param: value for param, value in zip(meta, values)}
+
+            if sympy.logic.simplify_logic(
+                inequalities.subs(config | max_values_of_non_meta_params)
+            ):
+                block_size_configs.append(config)
+
+        if isinstance(self._num_wraps, collections.abc.Iterable):
+            num_warps_configs = self._num_wraps
+        else:
+            num_warps_configs = (self._num_wraps,)
+
+        if isinstance(self._num_stages, collections.abc.Iterable):
+            num_stages_configs = self._num_stages
+        else:
+            num_stages_configs = (self._num_stages,)
+
+        compiler_configs = tuple(
+            {"num_warps": num_warps, "num_stages": num_stages}
+            for num_warps, num_stages in itertools.product(
+                num_warps_configs, num_stages_configs
+            )
+        )

         configs = [
             ast.Call(
@@ -260,19 +422,38 @@
                 ),
                 args=[
                     ast.Dict(
-                        keys=[ast.Constant(value=param) for param in meta],
-                        values=[ast.Constant(value=value) for value in values],
+                        keys=[
+                            ast.Constant(value=param)
+                            for param in block_size_config.keys()
+                        ],
+                        values=[
+                            ast.Constant(value=value)
+                            for value in block_size_config.values()
+                        ],
                     )
                 ],
                 keywords=[
-                    ast.keyword(arg="num_warps", value=ast.Constant(value=num_warps)),
-                    ast.keyword(arg="num_stages", value=ast.Constant(value=num_stages)),
+                    ast.keyword(
+                        arg="num_warps",
+                        value=ast.Constant(value=compiler_config["num_warps"]),
+                    ),
+                    ast.keyword(
+                        arg="num_stages",
+                        value=ast.Constant(value=compiler_config["num_stages"]),
+                    ),
                 ],
             )
-            for values in itertools.product(self._POWER_OF_TWOS, repeat=len(meta))
-            if self._MIN_PRODUCT <= math.prod(values) <= self._MAX_PRODUCT
+            for block_size_config, compiler_config in itertools.product(
+                block_size_configs, compiler_configs
+            )
         ]

+        if self._max_num_configs is not None and len(configs) > self._max_num_configs:
+            configs = random.sample(configs, k=self._max_num_configs)
+
+        if not configs:
+            return None
+
         return ast.Call(
             func=ast.Attribute(
                 value=ast.Name(id="ninetoothed", ctx=ast.Load()),
@@ -358,9 +539,7 @@
                 ast.Expr(
                     ast.Call(
                         func=ast.Subscript(
-                            value=ast.Name(
-                                id=self._func_name_with_auto_tuning, ctx=ast.Load()
-                            ),
+                            value=ast.Name(id=self._func_name, ctx=ast.Load()),
                             slice=self._generate_grid(),
                             ctx=ast.Load(),
                         ),
@@ -428,6 +607,8 @@
         indices = self._complete_indices(tensor, indices)
         offsets = type(self)._generate_offsets(tensor, indices)

+        tensor._last_generated_offsets = offsets
+
         for source_dim in range(tensor.source.ndim):
             for target_dim in range(tensor.target.ndim):
                 if target_dim not in invariant_target_dims:
@@ -452,7 +633,7 @@
                     * tensor.source.strides[source_dim]
                 )

-        pointers = name_for_pointers + sum(
+        overall_offsets = sum(
             offsets[source_dim][target_dim][
                 type(self)._generate_slices(tensor, target_dim)
             ]
@@ -462,6 +643,10 @@
             if target_dim not in invariant_target_dims
             and offsets[source_dim][target_dim] != 0
         )
+
+        tensor._last_generated_overall_offsets = overall_offsets
+
+        pointers = name_for_pointers + overall_offsets
         mask = functools.reduce(
             lambda x, y: x & y,
             (
@@ -848,6 +1033,9 @@ class _Inliner(ast.NodeTransformer):
         if func_def is None:
             return None, []

+        if inspect.getmodule(func) is libdevice:
+            return None, []
+
         collector = _ImportCollector()
         collector.visit(ast.parse(inspect.getsource(inspect.getmodule(func))))
         self.imports.extend(collector.imports)
@@ -994,3 +1182,82 @@ class _FunctionDefFinder(ast.NodeVisitor):
             self.result = node

         self.generic_visit(node)
+
+
+def _determine_log2_max_num_elements_per_block(
+    min_exponent, max_exponent=30, num_iterations=3
+):
+    _profile_pseudo_add_kernel(1)
+
+    for n in range(min_exponent, max_exponent + 1):
+        elapsed_time = 0
+
+        for _ in range(num_iterations):
+            elapsed_time += _profile_pseudo_add_kernel(2**n)
+
+        average_elapsed_time = elapsed_time / num_iterations
+
+        if average_elapsed_time >= 1:
+            return n - 1
+
+
+def _profile_pseudo_add_kernel(block_size):
+    cache_dir = triton.runtime.cache.default_cache_dir()
+    os.makedirs(cache_dir, exist_ok=True)
+
+    with tempfile.TemporaryDirectory() as backup_dir:
+        backup_path = os.path.join(backup_dir, str(uuid.uuid4()))
+
+        if os.path.exists(backup_path):
+            shutil.rmtree(backup_path)
+
+        shutil.move(cache_dir, backup_path)
+
+        try:
+            start_time = time.time()
+
+            _run_pseudo_add_kernel(block_size)
+
+            end_time = time.time()
+
+            elapsed_time = end_time - start_time
+        finally:
+            if os.path.exists(cache_dir):
+                shutil.rmtree(cache_dir)
+
+            shutil.move(backup_path, cache_dir)
+
+    return elapsed_time
+
+
+def _run_pseudo_add_kernel(block_size):
+    @triton.jit
+    def kernel(a_ptr, b_ptr, c_ptr, num_elements, BLOCK_SIZE: tl.constexpr):
+        pid = tl.program_id(0)
+
+        offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        mask = offs < num_elements
+
+        a = tl.load(a_ptr + offs, mask=mask)
+        b = tl.load(b_ptr + offs, mask=mask)
+
+        c = a + b
+
+        tl.store(c_ptr + offs, c, mask=mask)
+
+    num_elements = 0
+    shape = (num_elements,)
+    dtype = tl.float32
+
+    a = Tensor(shape=shape, dtype=dtype)
+    b = Tensor(shape=shape, dtype=dtype)
+    c = Tensor(shape=shape, dtype=dtype)
+
+    def data_ptr():
+        return 0
+
+    a.data_ptr = data_ptr
+    b.data_ptr = data_ptr
+    c.data_ptr = data_ptr
+
+    kernel[(1,)](a, b, c, num_elements, block_size)
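Note: the probe times a cold compile-and-launch of a trivial add kernel at increasing block sizes, moving Triton's on-disk cache aside so each compilation really is cold, and stops one step below the first exponent whose average exceeds one second. A sketch of reproducing the measurement by hand; this uses private helpers added above and needs a working Triton device, so treat it as illustrative:

    from ninetoothed.generation import _profile_pseudo_add_kernel

    # Warm up once, then time cold compiles at growing block sizes.
    _profile_pseudo_add_kernel(1)
    for n in (10, 14, 18):
        print(n, _profile_pseudo_add_kernel(2**n))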
ninetoothed/jit.py CHANGED
@@ -4,12 +4,25 @@ import sys
 from ninetoothed.generation import CodeGenerator


-def jit(func=None, *, caller="torch", kernel_name=None, _prettify=False):
+def jit(
+    func=None,
+    *,
+    caller="torch",
+    kernel_name=None,
+    num_warps=None,
+    num_stages=None,
+    max_num_configs=None,
+    _prettify=False,
+):
     """A decorator for generating compute kernels.

     :param func: The function to be compiled.
     :param caller: Who will call the compute kernel.
     :param kernel_name: The name for the generated kernel.
+    :param num_warps: The number of warps to use.
+    :param num_stages: The number of pipeline stages.
+    :param max_num_configs: The maximum number of auto-tuning
+        configurations to use.
     :param _prettify: Whether to prettify the generated code.
     :return: A handle to the compute kernel.

@@ -20,7 +33,15 @@ def jit(func=None, *, caller="torch", kernel_name=None, _prettify=False):
     """

     def wrapper(func):
-        return JIT(func, caller=caller, kernel_name=kernel_name, _prettify=_prettify)()
+        return JIT(
+            func,
+            caller=caller,
+            kernel_name=kernel_name,
+            num_warps=num_warps,
+            num_stages=num_stages,
+            max_num_configs=max_num_configs,
+            _prettify=_prettify,
+        )()

     if func is None:
         return wrapper
@@ -29,7 +50,16 @@ def jit(func=None, *, caller="torch", kernel_name=None, _prettify=False):


 class JIT:
-    def __init__(self, func, caller, kernel_name, _prettify=False):
+    def __init__(
+        self,
+        func,
+        caller,
+        kernel_name,
+        num_warps,
+        num_stages,
+        max_num_configs,
+        _prettify=False,
+    ):
         self.func = func

         self._caller = caller
@@ -39,12 +69,24 @@ class JIT:
         else:
             self._kernel_name = func.__name__

+        self._num_warps = num_warps
+
+        self._num_stages = num_stages
+
+        self._max_num_configs = max_num_configs
+
         self._prettify = _prettify

     def __call__(self):
         code_generator = CodeGenerator()
         source_file = code_generator(
-            self.func, self._caller, self._kernel_name, self._prettify
+            self.func,
+            self._caller,
+            self._kernel_name,
+            self._num_warps,
+            self._num_stages,
+            self._max_num_configs,
+            self._prettify,
         )
         module = type(self)._import_from_path(source_file, source_file)
         module_vars = vars(module)
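Note: with this plumbing, num_warps, num_stages, and max_num_configs flow from the decorator through JIT into CodeGenerator.__call__. Invocation is unchanged; a sketch assuming the add kernel from the earlier example and a CUDA-capable device:

    import torch

    a = torch.randn(1024, device="cuda")
    b = torch.randn(1024, device="cuda")
    c = torch.empty_like(a)

    add(a, b, c)  # auto-tunes over the generated configs on first launch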
ninetoothed/language.py CHANGED
@@ -1,7 +1,11 @@
 import ast

+from triton.language.extra import libdevice
+
 from ninetoothed.symbol import Symbol

+__all__ = ["libdevice"]
+
 LANGUAGE = "ninetoothed.language"


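Note: re-exporting libdevice pairs with the generation.py change that skips inlining for libdevice functions, so intrinsic calls are passed through to Triton. A sketch, assuming the conventional import alias and the tiled annotation style from the earlier examples:

    import ninetoothed
    import ninetoothed.language as ntl
    from ninetoothed import Tensor, block_size

    BLOCK_SIZE = block_size()

    @ninetoothed.jit
    def exp_kernel(
        x: Tensor(1).tile((BLOCK_SIZE,)),
        y: Tensor(1).tile((BLOCK_SIZE,)),
    ):
        # Forwarded to triton.language.extra.libdevice.exp, not inlined.
        y = ntl.libdevice.exp(x)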
ninetoothed/make.py CHANGED
@@ -2,6 +2,7 @@ import inspect

 from ninetoothed.aot import aot
 from ninetoothed.jit import jit
+from ninetoothed.utils import calculate_default_configs


 def make(
@@ -11,8 +12,9 @@ def make(
     caller="torch",
     kernel_name=None,
     output_dir=None,
-    num_warps=4,
-    num_stages=3,
+    num_warps=None,
+    num_stages=None,
+    max_num_configs=None,
 ):
     """Integrate the arrangement and the application of the tensors.

@@ -24,16 +26,33 @@ def make(
     :param output_dir: The directory to store the generated files.
     :param num_warps: The number of warps to use.
     :param num_stages: The number of pipeline stages.
+    :param max_num_configs: The maximum number of auto-tuning
+        configurations to use.
     :return: A handle to the compute kernel.
     """

+    default_num_warps, default_num_stages = calculate_default_configs()
+
+    if num_warps is None:
+        num_warps = default_num_warps
+
+    if num_stages is None:
+        num_stages = default_num_stages
+
     params = inspect.signature(application).parameters
     types = arrangement(*tensors)
     annotations = {param: type for param, type in zip(params, types)}
     application.__annotations__ = annotations

     if caller == "torch":
-        return jit(application, caller=caller, kernel_name=kernel_name)
+        return jit(
+            application,
+            caller=caller,
+            kernel_name=kernel_name,
+            num_warps=num_warps,
+            num_stages=num_stages,
+            max_num_configs=max_num_configs,
+        )

     return aot(
         application,
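Note: make no longer hard-codes num_warps=4 and num_stages=3; both default to device-derived values from the new ninetoothed/utils.py (below), and the JIT path now receives the tuning arguments too. A sketch of the arrangement/application style make expects:

    import ninetoothed
    from ninetoothed import Tensor, block_size

    BLOCK_SIZE = block_size()

    def arrangement(x, y, z):
        return tuple(t.tile((BLOCK_SIZE,)) for t in (x, y, z))

    def application(x, y, z):
        z = x + y

    # num_warps/num_stages fall back to calculate_default_configs().
    add = ninetoothed.make(arrangement, application, tuple(Tensor(1) for _ in range(3)))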
ninetoothed/symbol.py CHANGED
@@ -12,9 +12,20 @@ class Symbol:
     :param expr: The expression used to construct the symbol.
     :param constexpr: Whether the symbol is a constexpr.
     :param mata: Whether the symbol is a meta.
+    :param lower_bound: The minimum value for the symbol's range.
+    :param upper_bound: The maximum value for the symbol's range.
+    :param power_of_two: Whether the value should be a power of two.
     """

-    def __init__(self, expr, constexpr=None, meta=None):
+    def __init__(
+        self,
+        expr,
+        constexpr=None,
+        meta=None,
+        lower_bound=None,
+        upper_bound=None,
+        power_of_two=None,
+    ):
         if isinstance(expr, type(self)):
             self._node = expr._node
             return
@@ -43,6 +54,40 @@ class Symbol:
         if constexpr:
             self._node.id = naming.make_constexpr(self._node.id)

+        self._node.symbol = self
+
+        DEFAULT_LOWER_BOUND_FOR_META_SYMBOLS = 2**5
+        DEFAULT_UPPER_BOUND_FOR_META_SYMBOLS = 2**10
+        DEFAULT_POWER_OF_TWO_FOR_META_SYMBOLS = True
+
+        DEFAULT_LOWER_BOUND_FOR_NON_META_CONSTEXPR_SYMBOLS = 1
+        DEFAULT_UPPER_BOUND_FOR_NON_META_CONSTEXPR_SYMBOLS = 2**20
+        DEFAULT_POWER_OF_TWO_FOR_NON_META_CONSTEXPR_SYMBOLS = False
+
+        if lower_bound is not None:
+            self.lower_bound = lower_bound
+        else:
+            if meta:
+                self.lower_bound = DEFAULT_LOWER_BOUND_FOR_META_SYMBOLS
+            elif constexpr:
+                self.lower_bound = DEFAULT_LOWER_BOUND_FOR_NON_META_CONSTEXPR_SYMBOLS
+
+        if upper_bound is not None:
+            self.upper_bound = upper_bound
+        else:
+            if meta:
+                self.upper_bound = DEFAULT_UPPER_BOUND_FOR_META_SYMBOLS
+            elif constexpr:
+                self.upper_bound = DEFAULT_UPPER_BOUND_FOR_NON_META_CONSTEXPR_SYMBOLS
+
+        if power_of_two is not None:
+            self.power_of_two = power_of_two
+        else:
+            if meta:
+                self.power_of_two = DEFAULT_POWER_OF_TWO_FOR_META_SYMBOLS
+            elif constexpr:
+                self.power_of_two = DEFAULT_POWER_OF_TWO_FOR_NON_META_CONSTEXPR_SYMBOLS
+
     def __eq__(self, other):
         if isinstance(self._node, ast.Constant):
             if isinstance(other, Symbol) and isinstance(other._node, ast.Constant):
@@ -155,7 +200,7 @@ class Symbol:
             def visit_Name(self, node):
                 self.generic_visit(node)

-                self.names.add(node.id)
+                self.names.add(node.symbol)

         name_collector = NameCollector()

@@ -179,6 +224,24 @@ class Symbol:
         return isinstance(object, Symbol) and isinstance(object.node, ast.Name)


+def block_size(lower_bound=None, upper_bound=None):
+    """Create a block size symbol that serves as a meta-parameter.
+
+    :param lower_bound: The lower bound for the block size's range.
+    :param upper_bound: The upper bound for the block size's range.
+    :return: A block size symbol that serves as a meta-parameter.
+    """
+
+    name = naming.auto_generate(f"BLOCK_SIZE_{block_size._num_block_sizes}")
+
+    block_size._num_block_sizes += 1
+
+    return Symbol(name, meta=True, lower_bound=lower_bound, upper_bound=upper_bound)
+
+
+block_size._num_block_sizes = 0
+
+
 class _FindAndReplacer(ast.NodeTransformer):
     def __init__(self, targets, replacement):
         self._targets_unparsed = tuple(
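Note: a Symbol now records the tuning range it may take; meta-parameters default to powers of two in [2**5, 2**10], and non-meta constexprs to [1, 2**20] without the power-of-two restriction. The _node.symbol back-reference lets names() collect full Symbol objects (bounds included) rather than bare strings, which is how _generate_autotune recovers the ranges. Overriding the defaults, with illustrative values:

    from ninetoothed import Symbol, block_size

    # Candidates restricted to 16, 32, 64, and 128.
    BLOCK_M = Symbol("BLOCK_M", meta=True, lower_bound=16, upper_bound=128)

    # The convenience helper from this release, with an auto-generated name.
    BLOCK_N = block_size(lower_bound=16, upper_bound=128)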
ninetoothed/tensor.py CHANGED
@@ -14,7 +14,7 @@ class Tensor:
     :param dtype: The element type of the tensor.
     :param strides: The strides of the tensor.
     :param other: The values for out-of-bounds positions.
-    :param constexpr_shape: Whether the sizes are constexpr.
+    :param shape_options: The options for configuring shape symbols.
     :param name: The name of the tensor.
     :param source: For internal use only.
     :param source_dims: For internal use only.
@@ -31,7 +31,7 @@ class Tensor:
         dtype=None,
         strides=None,
         other=None,
-        constexpr_shape=None,
+        shape_options=None,
         name=None,
         source=None,
         source_dims=None,
@@ -48,9 +48,20 @@ class Tensor:
             self.name = naming.auto_generate(f"tensor_{type(self).num_instances}")

         if ndim is not None:
+            if shape_options is None:
+                shape_options = tuple({} for _ in range(ndim))
+
+            if isinstance(shape_options, dict):
+                shape_options = tuple(shape_options for _ in range(ndim))
+
+            shape_options = tuple(
+                size_options if size_options is not None else {}
+                for size_options in shape_options
+            )
+
             self.shape = (
-                Symbol(self.size_string(i), constexpr=constexpr_shape)
-                for i in range(ndim)
+                Symbol(self.size_string(i), **size_options)
+                for i, size_options in zip(range(ndim), shape_options)
             )
             self.strides = (Symbol(self.stride_string(i)) for i in range(ndim))
         else:
@@ -364,10 +375,10 @@ class Tensor:

     def names(self):
         if self.ndim == 0:
-            return {self.source.name}
+            return {Symbol(self.source.name)}

         return (
-            {self.source.pointer_string()}
+            {Symbol(self.source.pointer_string())}
             | {
                 name
                 for value in itertools.chain(self.shape, self.strides)
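Note: shape_options generalizes the removed constexpr_shape flag. A single dict applies to every dimension, a sequence supplies per-dimension options (with None meaning defaults), and each dict is forwarded as keyword arguments to Symbol, so keys such as constexpr, lower_bound, and upper_bound are accepted. For example:

    from ninetoothed import Tensor

    # Equivalent to the old constexpr_shape=True.
    x = Tensor(2, shape_options={"constexpr": True})

    # Per-dimension control: bound dim 0, leave dim 1 at the defaults.
    y = Tensor(2, shape_options=({"constexpr": True, "upper_bound": 2**16}, None))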
ninetoothed/utils.py ADDED
@@ -0,0 +1,12 @@
+import triton
+
+
+def calculate_default_configs():
+    device = triton.runtime.driver.active.get_current_device()
+    properties = triton.runtime.driver.active.utils.get_device_properties(device)
+    max_shared_mem = properties["max_shared_mem"]
+
+    num_warps = 8
+    num_stages = max_shared_mem // 2**15
+
+    return num_warps, num_stages
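Note: this helper centralizes the device-derived defaults that make previously hard-coded as 4 and 3. Usage sketch (requires an active Triton driver):

    from ninetoothed.utils import calculate_default_configs

    # e.g. (8, 3) on a device with 96 KiB of shared memory per SM,
    # since num_stages = max_shared_mem // 2**15.
    num_warps, num_stages = calculate_default_configs()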
ninetoothed/visualization.py CHANGED
@@ -10,8 +10,6 @@ def visualize(tensor, color=None, save_path=None):
     :param color: The color to be used for visualization.
     :param save_path: The path where the visualization should be saved.
     """
-    outline_width = 0.1
-    plt.rcParams["lines.linewidth"] = 72 * outline_width

     if color is None:
         color = f"C{visualize.count}"
@@ -21,6 +19,24 @@ def visualize(tensor, color=None, save_path=None):
     width = max_pos_y + 1
     height = max_pos_x + 1

+    _, ax = _prepare_figure_and_axes(width, height)
+
+    _visualize_tensor(ax, tensor, 0, 0, color)
+
+    plt.savefig(save_path, transparent=True, bbox_inches="tight", pad_inches=0)
+
+    plt.close()
+
+    visualize.count += 1
+
+
+visualize.count = 0
+
+
+def _prepare_figure_and_axes(width, height):
+    outline_width = 0.1
+    plt.rcParams["lines.linewidth"] = 72 * outline_width
+
     fig = plt.figure(figsize=(width + outline_width, height + outline_width))

     h = (Size.Fixed(0), Size.Fixed(width + outline_width))
@@ -41,16 +57,7 @@ def visualize(tensor, color=None, save_path=None):
     plt.xlim((-half_outline_width, width + half_outline_width))
     plt.ylim((-half_outline_width, height + half_outline_width))

-    _visualize_tensor(ax, tensor, 0, 0, color)
-
-    plt.savefig(save_path, transparent=True, bbox_inches="tight", pad_inches=0)
-
-    plt.close()
-
-    visualize.count += 1
-
-
-visualize.count = 0
+    return fig, ax


 def _visualize_tensor(ax, tensor, x, y, color, level_spacing=4):
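Note: the drawing path is unchanged; figure and axes construction simply moved into the reusable _prepare_figure_and_axes helper. A usage sketch (tile shape and output path illustrative; requires matplotlib, i.e. the "all" extra):

    from ninetoothed import Tensor
    from ninetoothed.visualization import visualize

    tiled = Tensor(2).tile((2, 2))
    visualize(tiled, save_path="tiled.png")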
{ninetoothed-0.15.1.dist-info → ninetoothed-0.17.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ninetoothed
-Version: 0.15.1
+Version: 0.17.0
 Summary: A domain-specific language based on Triton but providing higher-level abstraction.
 Project-URL: Homepage, https://github.com/InfiniTensor/ninetoothed
 Project-URL: Issues, https://github.com/InfiniTensor/ninetoothed/issues
@@ -10,6 +10,7 @@ Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.10
+Requires-Dist: sympy>=1.13.0
 Requires-Dist: triton>=3.0.0
 Provides-Extra: all
 Requires-Dist: matplotlib>=3.9.0; extra == 'all'
ninetoothed-0.17.0.dist-info/RECORD ADDED
@@ -0,0 +1,18 @@
+ninetoothed/__init__.py,sha256=F2bxRNhzcGdtADA8RehTuf-QK0xnxno8kxvr6H2L5Tg,552
+ninetoothed/aot.py,sha256=b7ykTC5roe_xg3NkZv6VyInBrEiNRwjpixCULUPRuEg,6506
+ninetoothed/cudaifier.py,sha256=5ylMr1q0B9NwbeXkpCu3o2nMGpDfh65nAQ0Az_qMQuI,877
+ninetoothed/dtype.py,sha256=-0iBleay5gYA4wtT3l17QjCesr7g26M6CSfhNJdI3k4,165
+ninetoothed/generation.py,sha256=wf8BL-x0PR6rG-9OSpgIZi8LtsIdFbqRUFiQFE5FIno,38107
+ninetoothed/jit.py,sha256=CpeSkO_zUe9DwtTJ2K2H7Bwpx-FvIHfrgzOcEosfpek,2946
+ninetoothed/language.py,sha256=ERiA4dpwiow2AT2xFeFWYg1KqlnBo6xxPGp8VZrP0Lk,574
+ninetoothed/make.py,sha256=fQKuRJL7HC2iGTAN323mlIWXz9Z3jotIoN68ur29Qlw,1834
+ninetoothed/naming.py,sha256=Fl0x4eDRStTpkXjJg6179ErEnY7bR5Qi0AT6RX9C3fU,951
+ninetoothed/symbol.py,sha256=lJo3NL2-T7tKbKjb6MCRLMemN94mqS3bIiG943P0Mbo,7454
+ninetoothed/tensor.py,sha256=gQEzHTcXqZVBFLc2YRfXTKxjxPWMxWN7fNl2BCfJwMs,14782
+ninetoothed/torchifier.py,sha256=aDijK5UOwK2oLXDHgDo8M959rJclEI0lcfaPr7GQTXY,1012
+ninetoothed/utils.py,sha256=mtRXABBVPnlgd2n1REh9oB3s_5bUsKhd3iwu3oJ5DSQ,338
+ninetoothed/visualization.py,sha256=oc3cA5qqT66_RoAs5D681SCxR5E5wgFwk95ZefdSfZU,3794
+ninetoothed-0.17.0.dist-info/METADATA,sha256=_V2M45nT4Yin-zs7hq5-yHlN6KwV5_zcA8afwXP8S-Q,7340
+ninetoothed-0.17.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ninetoothed-0.17.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ninetoothed-0.17.0.dist-info/RECORD,,
ninetoothed-0.15.1.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
-ninetoothed/__init__.py,sha256=zGaZiUzwJZ2jfwLxp7lT8ll_V5ngP5QYrfVbapftbCY,522
-ninetoothed/aot.py,sha256=5P9s-KAA7xNNdK8_fbCZEIteQlbaB_1wOl8_rEBQg9U,6128
-ninetoothed/cudaifier.py,sha256=5ylMr1q0B9NwbeXkpCu3o2nMGpDfh65nAQ0Az_qMQuI,877
-ninetoothed/dtype.py,sha256=-0iBleay5gYA4wtT3l17QjCesr7g26M6CSfhNJdI3k4,165
-ninetoothed/generation.py,sha256=Gmeh9OPmWZmF9CUY-UIIBPi-SOjFCxZjvXNwqX3uD84,30963
-ninetoothed/jit.py,sha256=0MFbFIODtw-bxuOC7WByxiVtQMeyvZkoDxvfAZ9rIFQ,2120
-ninetoothed/language.py,sha256=YwjlBENmmKPTnhaQ2uYbj5MwzrCAT7MLJ6VkQ6NeXJE,504
-ninetoothed/make.py,sha256=wRr3JwGt5E2OCquq_nzBZljdW-AJPOqH49cM08gwl4A,1287
-ninetoothed/naming.py,sha256=Fl0x4eDRStTpkXjJg6179ErEnY7bR5Qi0AT6RX9C3fU,951
-ninetoothed/symbol.py,sha256=UpGmx_jvaDtowADnp1DwYC3fvBXSiaMiYpU-ewkVo50,5261
-ninetoothed/tensor.py,sha256=ByTnoeqxD9lXprvy1DDp5L-zU2up52-jop9AAUrSTYk,14347
-ninetoothed/torchifier.py,sha256=aDijK5UOwK2oLXDHgDo8M959rJclEI0lcfaPr7GQTXY,1012
-ninetoothed/visualization.py,sha256=zlMH-0WplaboePGzcbpcj4UovpX0k2r4SysSPsNS4r4,3674
-ninetoothed-0.15.1.dist-info/METADATA,sha256=6RA1-6fYFfSTJnnwsRoVb4yIRrn4kfLhN47GNvmGji0,7311
-ninetoothed-0.15.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-ninetoothed-0.15.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-ninetoothed-0.15.1.dist-info/RECORD,,