gstaichi 0.1.18.dev1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools/cmake/SPIRV-ToolsConfig.cmake +5 -0
  2. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools/cmake/SPIRV-ToolsTarget-release.cmake +29 -0
  3. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools/cmake/SPIRV-ToolsTarget.cmake +113 -0
  4. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-diff/cmake/SPIRV-Tools-diffConfig.cmake +5 -0
  5. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-diff/cmake/SPIRV-Tools-diffTargets-release.cmake +19 -0
  6. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-diff/cmake/SPIRV-Tools-diffTargets.cmake +122 -0
  7. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-link/cmake/SPIRV-Tools-linkConfig.cmake +5 -0
  8. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-link/cmake/SPIRV-Tools-linkTargets-release.cmake +19 -0
  9. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-link/cmake/SPIRV-Tools-linkTargets.cmake +122 -0
  10. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-lint/cmake/SPIRV-Tools-lintConfig.cmake +5 -0
  11. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-lint/cmake/SPIRV-Tools-lintTargets-release.cmake +19 -0
  12. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-lint/cmake/SPIRV-Tools-lintTargets.cmake +122 -0
  13. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-opt/cmake/SPIRV-Tools-optConfig.cmake +5 -0
  14. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-opt/cmake/SPIRV-Tools-optTargets-release.cmake +19 -0
  15. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-opt/cmake/SPIRV-Tools-optTargets.cmake +122 -0
  16. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-reduce/cmake/SPIRV-Tools-reduceConfig.cmake +5 -0
  17. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-reduce/cmake/SPIRV-Tools-reduceTarget-release.cmake +19 -0
  18. gstaichi-0.1.18.dev1.data/data/SPIRV-Tools-reduce/cmake/SPIRV-Tools-reduceTarget.cmake +122 -0
  19. gstaichi-0.1.18.dev1.data/data/bin/SPIRV-Tools-shared.dll +0 -0
  20. gstaichi-0.1.18.dev1.data/data/include/GLFW/glfw3.h +6389 -0
  21. gstaichi-0.1.18.dev1.data/data/include/GLFW/glfw3native.h +594 -0
  22. gstaichi-0.1.18.dev1.data/data/include/spirv-tools/instrument.hpp +268 -0
  23. gstaichi-0.1.18.dev1.data/data/include/spirv-tools/libspirv.h +907 -0
  24. gstaichi-0.1.18.dev1.data/data/include/spirv-tools/libspirv.hpp +375 -0
  25. gstaichi-0.1.18.dev1.data/data/include/spirv-tools/linker.hpp +97 -0
  26. gstaichi-0.1.18.dev1.data/data/include/spirv-tools/optimizer.hpp +970 -0
  27. gstaichi-0.1.18.dev1.data/data/lib/SPIRV-Tools-diff.lib +0 -0
  28. gstaichi-0.1.18.dev1.data/data/lib/SPIRV-Tools-link.lib +0 -0
  29. gstaichi-0.1.18.dev1.data/data/lib/SPIRV-Tools-lint.lib +0 -0
  30. gstaichi-0.1.18.dev1.data/data/lib/SPIRV-Tools-opt.lib +0 -0
  31. gstaichi-0.1.18.dev1.data/data/lib/SPIRV-Tools-reduce.lib +0 -0
  32. gstaichi-0.1.18.dev1.data/data/lib/SPIRV-Tools-shared.lib +0 -0
  33. gstaichi-0.1.18.dev1.data/data/lib/SPIRV-Tools.lib +0 -0
  34. gstaichi-0.1.18.dev1.data/data/lib/cmake/glfw3/glfw3Config.cmake +3 -0
  35. gstaichi-0.1.18.dev1.data/data/lib/cmake/glfw3/glfw3ConfigVersion.cmake +65 -0
  36. gstaichi-0.1.18.dev1.data/data/lib/cmake/glfw3/glfw3Targets-release.cmake +19 -0
  37. gstaichi-0.1.18.dev1.data/data/lib/cmake/glfw3/glfw3Targets.cmake +107 -0
  38. gstaichi-0.1.18.dev1.data/data/lib/glfw3.lib +0 -0
  39. gstaichi-0.1.18.dev1.dist-info/METADATA +108 -0
  40. gstaichi-0.1.18.dev1.dist-info/RECORD +198 -0
  41. gstaichi-0.1.18.dev1.dist-info/WHEEL +5 -0
  42. gstaichi-0.1.18.dev1.dist-info/entry_points.txt +2 -0
  43. gstaichi-0.1.18.dev1.dist-info/licenses/LICENSE +201 -0
  44. gstaichi-0.1.18.dev1.dist-info/top_level.txt +1 -0
  45. taichi/CHANGELOG.md +15 -0
  46. taichi/__init__.py +44 -0
  47. taichi/__main__.py +5 -0
  48. taichi/_funcs.py +706 -0
  49. taichi/_kernels.py +420 -0
  50. taichi/_lib/__init__.py +3 -0
  51. taichi/_lib/c_api/bin/taichi_c_api.dll +0 -0
  52. taichi/_lib/c_api/include/taichi/cpp/taichi.hpp +1401 -0
  53. taichi/_lib/c_api/include/taichi/taichi.h +29 -0
  54. taichi/_lib/c_api/include/taichi/taichi_core.h +1111 -0
  55. taichi/_lib/c_api/include/taichi/taichi_cpu.h +29 -0
  56. taichi/_lib/c_api/include/taichi/taichi_cuda.h +36 -0
  57. taichi/_lib/c_api/include/taichi/taichi_platform.h +55 -0
  58. taichi/_lib/c_api/include/taichi/taichi_unity.h +64 -0
  59. taichi/_lib/c_api/include/taichi/taichi_vulkan.h +151 -0
  60. taichi/_lib/c_api/lib/taichi_c_api.lib +0 -0
  61. taichi/_lib/c_api/runtime/runtime_cuda.bc +0 -0
  62. taichi/_lib/c_api/runtime/runtime_x64.bc +0 -0
  63. taichi/_lib/c_api/runtime/slim_libdevice.10.bc +0 -0
  64. taichi/_lib/c_api/taichi/lib/cmake/taichi/TaichiConfig.cmake +29 -0
  65. taichi/_lib/c_api/taichi/lib/cmake/taichi/TaichiConfigVersion.cmake +65 -0
  66. taichi/_lib/c_api/taichi/lib/cmake/taichi/TaichiTargets.cmake +121 -0
  67. taichi/_lib/core/__init__.py +0 -0
  68. taichi/_lib/core/py.typed +0 -0
  69. taichi/_lib/core/taichi_python.cp310-win_amd64.pyd +0 -0
  70. taichi/_lib/core/taichi_python.pyi +3077 -0
  71. taichi/_lib/runtime/runtime_cuda.bc +0 -0
  72. taichi/_lib/runtime/runtime_x64.bc +0 -0
  73. taichi/_lib/runtime/slim_libdevice.10.bc +0 -0
  74. taichi/_lib/utils.py +249 -0
  75. taichi/_logging.py +131 -0
  76. taichi/_main.py +552 -0
  77. taichi/_snode/__init__.py +5 -0
  78. taichi/_snode/fields_builder.py +189 -0
  79. taichi/_snode/snode_tree.py +34 -0
  80. taichi/_ti_module/__init__.py +3 -0
  81. taichi/_ti_module/cppgen.py +309 -0
  82. taichi/_ti_module/module.py +145 -0
  83. taichi/_version.py +1 -0
  84. taichi/_version_check.py +100 -0
  85. taichi/ad/__init__.py +3 -0
  86. taichi/ad/_ad.py +530 -0
  87. taichi/algorithms/__init__.py +3 -0
  88. taichi/algorithms/_algorithms.py +117 -0
  89. taichi/aot/__init__.py +12 -0
  90. taichi/aot/_export.py +28 -0
  91. taichi/aot/conventions/__init__.py +3 -0
  92. taichi/aot/conventions/gfxruntime140/__init__.py +38 -0
  93. taichi/aot/conventions/gfxruntime140/dr.py +244 -0
  94. taichi/aot/conventions/gfxruntime140/sr.py +613 -0
  95. taichi/aot/module.py +253 -0
  96. taichi/aot/utils.py +151 -0
  97. taichi/assets/.git +1 -0
  98. taichi/assets/Go-Regular.ttf +0 -0
  99. taichi/assets/static/imgs/ti_gallery.png +0 -0
  100. taichi/examples/minimal.py +28 -0
  101. taichi/experimental.py +16 -0
  102. taichi/graph/__init__.py +3 -0
  103. taichi/graph/_graph.py +292 -0
  104. taichi/lang/__init__.py +50 -0
  105. taichi/lang/_ndarray.py +348 -0
  106. taichi/lang/_ndrange.py +152 -0
  107. taichi/lang/_texture.py +172 -0
  108. taichi/lang/_wrap_inspect.py +189 -0
  109. taichi/lang/any_array.py +99 -0
  110. taichi/lang/argpack.py +411 -0
  111. taichi/lang/ast/__init__.py +5 -0
  112. taichi/lang/ast/ast_transformer.py +1806 -0
  113. taichi/lang/ast/ast_transformer_utils.py +328 -0
  114. taichi/lang/ast/checkers.py +106 -0
  115. taichi/lang/ast/symbol_resolver.py +57 -0
  116. taichi/lang/ast/transform.py +9 -0
  117. taichi/lang/common_ops.py +310 -0
  118. taichi/lang/exception.py +80 -0
  119. taichi/lang/expr.py +180 -0
  120. taichi/lang/field.py +464 -0
  121. taichi/lang/impl.py +1246 -0
  122. taichi/lang/kernel_arguments.py +157 -0
  123. taichi/lang/kernel_impl.py +1415 -0
  124. taichi/lang/matrix.py +1877 -0
  125. taichi/lang/matrix_ops.py +341 -0
  126. taichi/lang/matrix_ops_utils.py +190 -0
  127. taichi/lang/mesh.py +687 -0
  128. taichi/lang/misc.py +807 -0
  129. taichi/lang/ops.py +1489 -0
  130. taichi/lang/runtime_ops.py +13 -0
  131. taichi/lang/shell.py +35 -0
  132. taichi/lang/simt/__init__.py +5 -0
  133. taichi/lang/simt/block.py +94 -0
  134. taichi/lang/simt/grid.py +7 -0
  135. taichi/lang/simt/subgroup.py +191 -0
  136. taichi/lang/simt/warp.py +96 -0
  137. taichi/lang/snode.py +487 -0
  138. taichi/lang/source_builder.py +150 -0
  139. taichi/lang/struct.py +855 -0
  140. taichi/lang/util.py +381 -0
  141. taichi/linalg/__init__.py +8 -0
  142. taichi/linalg/matrixfree_cg.py +310 -0
  143. taichi/linalg/sparse_cg.py +59 -0
  144. taichi/linalg/sparse_matrix.py +303 -0
  145. taichi/linalg/sparse_solver.py +123 -0
  146. taichi/math/__init__.py +11 -0
  147. taichi/math/_complex.py +204 -0
  148. taichi/math/mathimpl.py +886 -0
  149. taichi/profiler/__init__.py +6 -0
  150. taichi/profiler/kernel_metrics.py +260 -0
  151. taichi/profiler/kernel_profiler.py +592 -0
  152. taichi/profiler/memory_profiler.py +15 -0
  153. taichi/profiler/scoped_profiler.py +36 -0
  154. taichi/shaders/Circles_vk.frag +29 -0
  155. taichi/shaders/Circles_vk.vert +45 -0
  156. taichi/shaders/Circles_vk_frag.spv +0 -0
  157. taichi/shaders/Circles_vk_vert.spv +0 -0
  158. taichi/shaders/Lines_vk.frag +9 -0
  159. taichi/shaders/Lines_vk.vert +11 -0
  160. taichi/shaders/Lines_vk_frag.spv +0 -0
  161. taichi/shaders/Lines_vk_vert.spv +0 -0
  162. taichi/shaders/Mesh_vk.frag +71 -0
  163. taichi/shaders/Mesh_vk.vert +68 -0
  164. taichi/shaders/Mesh_vk_frag.spv +0 -0
  165. taichi/shaders/Mesh_vk_vert.spv +0 -0
  166. taichi/shaders/Particles_vk.frag +95 -0
  167. taichi/shaders/Particles_vk.vert +73 -0
  168. taichi/shaders/Particles_vk_frag.spv +0 -0
  169. taichi/shaders/Particles_vk_vert.spv +0 -0
  170. taichi/shaders/SceneLines2quad_vk_comp.spv +0 -0
  171. taichi/shaders/SceneLines_vk.frag +9 -0
  172. taichi/shaders/SceneLines_vk.vert +12 -0
  173. taichi/shaders/SceneLines_vk_frag.spv +0 -0
  174. taichi/shaders/SceneLines_vk_vert.spv +0 -0
  175. taichi/shaders/SetImage_vk.frag +21 -0
  176. taichi/shaders/SetImage_vk.vert +15 -0
  177. taichi/shaders/SetImage_vk_frag.spv +0 -0
  178. taichi/shaders/SetImage_vk_vert.spv +0 -0
  179. taichi/shaders/Triangles_vk.frag +16 -0
  180. taichi/shaders/Triangles_vk.vert +29 -0
  181. taichi/shaders/Triangles_vk_frag.spv +0 -0
  182. taichi/shaders/Triangles_vk_vert.spv +0 -0
  183. taichi/shaders/lines2quad_vk_comp.spv +0 -0
  184. taichi/sparse/__init__.py +3 -0
  185. taichi/sparse/_sparse_grid.py +77 -0
  186. taichi/tools/__init__.py +12 -0
  187. taichi/tools/diagnose.py +124 -0
  188. taichi/tools/np2ply.py +364 -0
  189. taichi/tools/vtk.py +38 -0
  190. taichi/types/__init__.py +19 -0
  191. taichi/types/annotations.py +47 -0
  192. taichi/types/compound_types.py +90 -0
  193. taichi/types/enums.py +49 -0
  194. taichi/types/ndarray_type.py +147 -0
  195. taichi/types/primitive_types.py +203 -0
  196. taichi/types/quant.py +88 -0
  197. taichi/types/texture_type.py +85 -0
  198. taichi/types/utils.py +13 -0
taichi/lang/kernel_impl.py (new file)
@@ -0,0 +1,1415 @@
# type: ignore

import ast
import dataclasses
import functools
import inspect
import json
import operator
import os
import pathlib
import re
import sys
import textwrap
import time
import types
import typing
import warnings
import weakref
from typing import Any, Callable, Type, Union

import numpy as np

import taichi.lang
import taichi.lang._ndarray
import taichi.lang._texture
import taichi.lang.expr
import taichi.lang.snode
import taichi.types.annotations
from taichi import _logging
from taichi._lib import core as _ti_core
from taichi._lib.core.taichi_python import ASTBuilder
from taichi.lang import impl, ops, runtime_ops
from taichi.lang._wrap_inspect import getsourcefile, getsourcelines
from taichi.lang.any_array import AnyArray
from taichi.lang.argpack import ArgPack, ArgPackType
from taichi.lang.ast import (
    ASTTransformerContext,
    KernelSimplicityASTChecker,
    transform_tree,
)
from taichi.lang.ast.ast_transformer_utils import ReturnStatus
from taichi.lang.exception import (
    TaichiCompilationError,
    TaichiRuntimeError,
    TaichiRuntimeTypeError,
    TaichiSyntaxError,
    TaichiTypeError,
    handle_exception_from_cpp,
)
from taichi.lang.expr import Expr
from taichi.lang.kernel_arguments import KernelArgument
from taichi.lang.matrix import MatrixType
from taichi.lang.shell import _shell_pop_print
from taichi.lang.struct import StructType
from taichi.lang.util import cook_dtype, has_paddle, has_pytorch, to_taichi_type
from taichi.types import (
    ndarray_type,
    primitive_types,
    sparse_matrix_builder,
    template,
    texture_type,
)
from taichi.types.compound_types import CompoundType
from taichi.types.enums import AutodiffMode, Layout
from taichi.types.utils import is_signed


def func(fn: Callable, is_real_function: bool = False):
    """Marks a function as callable in Taichi-scope.

    This decorator transforms a Python function into a Taichi one. Taichi
    will JIT compile it into native instructions.

    Args:
        fn (Callable): The Python function to be decorated
        is_real_function (bool): Whether the function is a real function

    Returns:
        Callable: The decorated function

    Example::

        >>> @ti.func
        >>> def foo(x):
        >>>     return x + 2
        >>>
        >>> @ti.kernel
        >>> def run():
        >>>     print(foo(40))  # 42
    """
    is_classfunc = _inside_class(level_of_class_stackframe=3 + is_real_function)

    fun = Func(fn, _classfunc=is_classfunc, is_real_function=is_real_function)

    @functools.wraps(fn)
    def decorated(*args, **kwargs):
        return fun.__call__(*args, **kwargs)

    decorated._is_taichi_function = True
    decorated._is_real_function = is_real_function
    decorated.func = fun
    return decorated


def real_func(fn: Callable):
    return func(fn, is_real_function=True)


def pyfunc(fn: Callable):
    """Marks a function as callable in both Taichi and Python scopes.

    When called inside the Taichi scope, Taichi will JIT compile it into
    native instructions. Otherwise it will be invoked directly as a
    Python function.

    See also :func:`~taichi.lang.kernel_impl.func`.

    Args:
        fn (Callable): The Python function to be decorated

    Returns:
        Callable: The decorated function
    """
    is_classfunc = _inside_class(level_of_class_stackframe=3)
    fun = Func(fn, _classfunc=is_classfunc, _pyfunc=True)

    @functools.wraps(fn)
    def decorated(*args, **kwargs):
        return fun.__call__(*args, **kwargs)

    decorated._is_taichi_function = True
    decorated._is_real_function = False
    decorated.func = fun
    return decorated

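# A minimal sketch of the dual-scope behaviour documented above (illustrative
# only; assumes `import taichi as ti` and a prior `ti.init()` call):
#
#     >>> @ti.pyfunc
#     >>> def clamp01(x):
#     >>>     return min(max(x, 0.0), 1.0)
#     >>>
#     >>> clamp01(1.5)  # Python-scope: runs as plain Python -> 1.0
#     >>>
#     >>> @ti.kernel
#     >>> def run() -> ti.f32:
#     >>>     return clamp01(1.5)  # Taichi-scope: JIT compiled -> 1.0
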
def _get_tree_and_ctx(
    self: "Func | Kernel",
    excluded_parameters=(),
    is_kernel: bool = True,
    arg_features=None,
    args=None,
    ast_builder: ASTBuilder | None = None,
    is_real_function: bool = False,
):
    file = getsourcefile(self.func)
    src, start_lineno = getsourcelines(self.func)
    src = [textwrap.fill(line, tabsize=4, width=9999) for line in src]
    tree = ast.parse(textwrap.dedent("\n".join(src)))

    func_body = tree.body[0]
    func_body.decorator_list = []

    global_vars = _get_global_vars(self.func)

    if is_kernel or is_real_function:
        # inject template parameters into globals
        for i in self.template_slot_locations:
            template_var_name = self.arguments[i].name
            global_vars[template_var_name] = args[i]
        parameters = inspect.signature(self.func).parameters
        for arg_i, (param_name, param) in enumerate(parameters.items()):
            if dataclasses.is_dataclass(param.annotation):
                for member_field in dataclasses.fields(param.annotation):
                    child_value = getattr(args[arg_i], member_field.name)
                    flat_name = f"__ti_{param_name}_{member_field.name}"
                    global_vars[flat_name] = child_value

    return tree, ASTTransformerContext(
        excluded_parameters=excluded_parameters,
        is_kernel=is_kernel,
        func=self,
        arg_features=arg_features,
        global_vars=global_vars,
        argument_data=args,
        src=src,
        start_lineno=start_lineno,
        file=file,
        ast_builder=ast_builder,
        is_real_function=is_real_function,
    )


def expand_func_arguments(arguments: list[KernelArgument]) -> list[KernelArgument]:
    new_arguments = []
    for argument in arguments:
        if dataclasses.is_dataclass(argument.annotation):
            for field in dataclasses.fields(argument.annotation):
                new_argument = KernelArgument(
                    _annotation=field.type,
                    _name=f"__ti_{argument.name}_{field.name}",
                )
                new_arguments.append(new_argument)
        else:
            new_arguments.append(argument)
    return new_arguments

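# A sketch of the dataclass expansion performed by `expand_func_arguments`
# above (illustrative; `MyStruct` and its fields are hypothetical):
#
#     >>> @dataclasses.dataclass
#     >>> class MyStruct:
#     >>>     a: ti.types.ndarray()
#     >>>     b: ti.types.ndarray()
#
# A parameter `s: MyStruct` is replaced by two flattened arguments named
# `__ti_s_a` and `__ti_s_b`, following the `__ti_{param}_{field}` scheme used
# throughout this module.
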
def _process_args(self: "Func | Kernel", is_func: bool, args, kwargs):
    if is_func:
        self.arguments = expand_func_arguments(self.arguments)
    fused_args = [argument.default for argument in self.arguments]
    len_args = len(args)

    if len_args > len(fused_args):
        arg_str = ", ".join([str(arg) for arg in args])
        expected_str = ", ".join([f"{arg.name} : {arg.annotation}" for arg in self.arguments])
        msg = f"Too many arguments. Expected ({expected_str}), got ({arg_str})."
        raise TaichiSyntaxError(msg)

    for i, arg in enumerate(args):
        fused_args[i] = arg

    for key, value in kwargs.items():
        found = False
        for i, arg in enumerate(self.arguments):
            if key == arg.name:
                if i < len_args:
                    raise TaichiSyntaxError(f"Multiple values for argument '{key}'.")
                fused_args[i] = value
                found = True
                break
        if not found:
            raise TaichiSyntaxError(f"Unexpected argument '{key}'.")

    for i, arg in enumerate(fused_args):
        if arg is inspect.Parameter.empty:
            if self.arguments[i].annotation is inspect._empty:
                raise TaichiSyntaxError(f"Parameter `{self.arguments[i].name}` missing.")
            else:
                raise TaichiSyntaxError(
                    f"Parameter `{self.arguments[i].name} : {self.arguments[i].annotation}` missing."
                )

    return tuple(fused_args)

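# A sketch of the positional/keyword fusion in `_process_args` above
# (illustrative; assumes a kernel defined as `def k(a: ti.i32, b: ti.i32)`):
#
#     >>> k(1, b=2)  # fused_args becomes (1, 2)
#     >>> k(1, a=2)  # TaichiSyntaxError: Multiple values for argument 'a'.
#     >>> k(1, c=2)  # TaichiSyntaxError: Unexpected argument 'c'.
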
def unpack_ndarray_struct(tree: ast.Module, struct_locals: set[str]) -> ast.Module:
    class AttributeToNameTransformer(ast.NodeTransformer):
        def visit_Attribute(self, node: ast.AST):
            if isinstance(node.value, ast.Attribute):
                return node
            if not isinstance(node.value, ast.Name):
                return node
            base_id = node.value.id
            attr_name = node.attr
            new_id = f"__ti_{base_id}_{attr_name}"
            if new_id not in struct_locals:
                return node
            return ast.copy_location(ast.Name(id=new_id, ctx=node.ctx), node)

    transformer = AttributeToNameTransformer()
    new_tree = transformer.visit(tree)
    ast.fix_missing_locations(new_tree)
    return new_tree


def extract_struct_locals_from_context(ctx: ASTTransformerContext):
    """
    - Uses ctx.func.func to get the function signature.
    - Searches this for any dataclasses:
        - If it finds any dataclasses, it converts them into expanded names.
        - E.g. my_struct: MyStruct, where MyStruct contains a, b, c, would become:
          {"__ti_my_struct_a", "__ti_my_struct_b", "__ti_my_struct_c"}
    """
    assert ctx.func is not None
    sig = inspect.signature(ctx.func.func)
    parameters = sig.parameters
    struct_locals = set()
    for param_name, parameter in parameters.items():
        if dataclasses.is_dataclass(parameter.annotation):
            for field in dataclasses.fields(parameter.annotation):
                child_name = f"__ti_{param_name}_{field.name}"
                struct_locals.add(child_name)
    return struct_locals

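# Taken together, the two helpers above flatten dataclass parameters: the
# expanded names are computed from the signature, and attribute accesses are
# then rewritten into plain names. A sketch (hypothetical struct `s` with
# fields `a` and `b`):
#
#     >>> struct_locals = {"__ti_s_a", "__ti_s_b"}
#     >>> tree = ast.parse("x = s.a + s.b")
#     >>> tree = unpack_ndarray_struct(tree, struct_locals=struct_locals)
#     >>> ast.unparse(tree)
#     'x = __ti_s_a + __ti_s_b'
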
class Func:
    function_counter = 0

    def __init__(self, _func: Callable, _classfunc=False, _pyfunc=False, is_real_function=False):
        self.func = _func
        self.func_id = Func.function_counter
        Func.function_counter += 1
        self.compiled = {}
        self.classfunc = _classfunc
        self.pyfunc = _pyfunc
        self.is_real_function = is_real_function
        self.arguments: list[KernelArgument] = []
        self.orig_arguments: list[KernelArgument] = []
        self.return_type: tuple[Type, ...] | None = None
        self.extract_arguments()
        self.template_slot_locations: list[int] = []
        for i, arg in enumerate(self.arguments):
            if arg.annotation == template or isinstance(arg.annotation, template):
                self.template_slot_locations.append(i)
        self.mapper = TaichiCallableTemplateMapper(self.arguments, self.template_slot_locations)
        self.taichi_functions = {}  # The |Function| class in C++
        self.has_print = False

    def __call__(self, *args, **kwargs):
        args = _process_args(self, is_func=True, args=args, kwargs=kwargs)

        if not impl.inside_kernel():
            if not self.pyfunc:
                raise TaichiSyntaxError("Taichi functions cannot be called from Python-scope.")
            return self.func(*args)

        current_kernel = impl.get_runtime().current_kernel
        if self.is_real_function:
            if current_kernel.autodiff_mode != AutodiffMode.NONE:
                raise TaichiSyntaxError("Real functions are unsupported in gradient kernels.")
            instance_id, arg_features = self.mapper.lookup(args)
            key = _ti_core.FunctionKey(self.func.__name__, self.func_id, instance_id)
            if key.instance_id not in self.compiled:
                self.do_compile(key=key, args=args, arg_features=arg_features)
            return self.func_call_rvalue(key=key, args=args)
        tree, ctx = _get_tree_and_ctx(
            self,
            is_kernel=False,
            args=args,
            ast_builder=current_kernel.ast_builder(),
            is_real_function=self.is_real_function,
        )

        struct_locals = extract_struct_locals_from_context(ctx)
        tree = unpack_ndarray_struct(tree, struct_locals=struct_locals)
        ret = transform_tree(tree, ctx)
        if not self.is_real_function:
            if self.return_type and ctx.returned != ReturnStatus.ReturnedValue:
                raise TaichiSyntaxError("Function has a return type but does not have a return statement")
        return ret

    def func_call_rvalue(self, key, args):
        # Skip the template args, e.g., |self|
        assert self.is_real_function
        non_template_args = []
        dbg_info = _ti_core.DebugInfo(impl.get_runtime().get_current_src_info())
        for i, kernel_arg in enumerate(self.arguments):
            anno = kernel_arg.annotation
            if not isinstance(anno, template):
                if id(anno) in primitive_types.type_ids:
                    non_template_args.append(ops.cast(args[i], anno))
                elif isinstance(anno, primitive_types.RefType):
                    non_template_args.append(_ti_core.make_reference(args[i].ptr, dbg_info))
                elif isinstance(anno, ndarray_type.NdarrayType):
                    if not isinstance(args[i], AnyArray):
                        raise TaichiTypeError(
                            f"Expected ndarray in the kernel argument for argument {kernel_arg.name}, got {args[i]}"
                        )
                    non_template_args += _ti_core.get_external_tensor_real_func_args(args[i].ptr, dbg_info)
                else:
                    non_template_args.append(args[i])
        non_template_args = impl.make_expr_group(non_template_args)
        compiling_callable = impl.get_runtime().compiling_callable
        assert compiling_callable is not None
        func_call = compiling_callable.ast_builder().insert_func_call(
            self.taichi_functions[key.instance_id], non_template_args, dbg_info
        )
        if self.return_type is None:
            return None
        func_call = Expr(func_call)
        ret = []

        for i, return_type in enumerate(self.return_type):
            if id(return_type) in primitive_types.type_ids:
                ret.append(
                    Expr(
                        _ti_core.make_get_element_expr(
                            func_call.ptr, (i,), _ti_core.DebugInfo(impl.get_runtime().get_current_src_info())
                        )
                    )
                )
            elif isinstance(return_type, (StructType, MatrixType)):
                ret.append(return_type.from_taichi_object(func_call, (i,)))
            else:
                raise TaichiTypeError(f"Unsupported return type for return value {i}: {return_type}")
        if len(ret) == 1:
            return ret[0]
        return tuple(ret)

    def do_compile(self, key, args, arg_features):
        tree, ctx = _get_tree_and_ctx(
            self, is_kernel=False, args=args, arg_features=arg_features, is_real_function=self.is_real_function
        )
        fn = impl.get_runtime().prog.create_function(key)

        def func_body():
            old_callable = impl.get_runtime().compiling_callable
            impl.get_runtime().compiling_callable = fn
            ctx.ast_builder = fn.ast_builder()
            transform_tree(tree, ctx)
            impl.get_runtime().compiling_callable = old_callable

        self.taichi_functions[key.instance_id] = fn
        self.compiled[key.instance_id] = func_body
        self.taichi_functions[key.instance_id].set_function_body(func_body)

    def extract_arguments(self) -> None:
        sig = inspect.signature(self.func)
        if sig.return_annotation not in (inspect.Signature.empty, None):
            self.return_type = sig.return_annotation
            if (
                isinstance(self.return_type, (types.GenericAlias, typing._GenericAlias))
                and self.return_type.__origin__ is tuple
            ):
                self.return_type = self.return_type.__args__
            if not isinstance(self.return_type, (list, tuple)):
                self.return_type = (self.return_type,)
            for i, return_type in enumerate(self.return_type):
                if return_type is Ellipsis:
                    raise TaichiSyntaxError("Ellipsis is not supported in return type annotations")
        params = sig.parameters
        arg_names = params.keys()
        for i, arg_name in enumerate(arg_names):
            param = params[arg_name]
            if param.kind == inspect.Parameter.VAR_KEYWORD:
                raise TaichiSyntaxError("Taichi functions do not support variable keyword parameters (i.e., **kwargs)")
            if param.kind == inspect.Parameter.VAR_POSITIONAL:
                raise TaichiSyntaxError("Taichi functions do not support variable positional parameters (i.e., *args)")
            if param.kind == inspect.Parameter.KEYWORD_ONLY:
                raise TaichiSyntaxError("Taichi functions do not support keyword parameters")
            if param.kind != inspect.Parameter.POSITIONAL_OR_KEYWORD:
                raise TaichiSyntaxError('Taichi functions only support "positional or keyword" parameters')
            annotation = param.annotation
            if annotation is inspect.Parameter.empty:
                if i == 0 and self.classfunc:
                    annotation = template()
                # TODO: pyfunc also needs a type-annotation check when real function is enabled,
                # but that has to happen at runtime when we know which scope it's called from.
                elif not self.pyfunc and self.is_real_function:
                    raise TaichiSyntaxError(
                        f"Taichi function `{self.func.__name__}` parameter `{arg_name}` must be type annotated"
                    )
            else:
                if isinstance(annotation, ndarray_type.NdarrayType):
                    pass
                elif isinstance(annotation, MatrixType):
                    pass
                elif isinstance(annotation, StructType):
                    pass
                elif id(annotation) in primitive_types.type_ids:
                    pass
                elif type(annotation) == taichi.types.annotations.Template:
                    pass
                elif isinstance(annotation, template) or annotation == taichi.types.annotations.Template:
                    pass
                elif isinstance(annotation, primitive_types.RefType):
                    pass
                elif isinstance(annotation, type) and dataclasses.is_dataclass(annotation):
                    pass
                else:
                    raise TaichiSyntaxError(f"Invalid type annotation (argument {i}) of Taichi function: {annotation}")
            self.arguments.append(KernelArgument(annotation, param.name, param.default))
            self.orig_arguments.append(KernelArgument(annotation, param.name, param.default))


AnnotationType = Union[
    template,
    ArgPackType,
    "texture_type.TextureType",
    "texture_type.RWTextureType",
    ndarray_type.NdarrayType,
    sparse_matrix_builder,
    Any,
]


class TaichiCallableTemplateMapper:
    """
    This should probably be renamed to something like FeatureMapper or
    FeatureExtractor, since:
    - it's not specific to templates
    - it extracts what are later called 'features'; for ndarray, for example, this includes:
        - element type
        - number of dimensions
        - needs grad (or not)
    - these are returned as a heterogeneous tuple, whose contents depend on the type
    """

    def __init__(self, arguments: list[KernelArgument], template_slot_locations: list[int]) -> None:
        self.arguments = arguments
        self.num_args = len(arguments)
        self.template_slot_locations = template_slot_locations
        self.mapping = {}

    @staticmethod
    def extract_arg(arg, annotation: AnnotationType, arg_name: str):
        if annotation == template or isinstance(annotation, template):
            if isinstance(arg, taichi.lang.snode.SNode):
                return arg.ptr
            if isinstance(arg, taichi.lang.expr.Expr):
                return arg.ptr.get_underlying_ptr_address()
            if isinstance(arg, _ti_core.Expr):
                return arg.get_underlying_ptr_address()
            if isinstance(arg, tuple):
                return tuple(TaichiCallableTemplateMapper.extract_arg(item, annotation, arg_name) for item in arg)
            if isinstance(arg, taichi.lang._ndarray.Ndarray):
                raise TaichiRuntimeTypeError(
                    "Ndarray shouldn't be passed in via `ti.template()`; please annotate your kernel using "
                    "`ti.types.ndarray(...)` instead"
                )

            if isinstance(arg, (list, tuple, dict, set)) or hasattr(arg, "_data_oriented"):
                # [Composite arguments] Return a weak reference to the object.
                # The Taichi kernel will cache the extracted arguments, so we can't simply return
                # the original argument; a weak reference to the original value is returned instead
                # to avoid memory leaks.

                # TODO(zhanlue): replace "tuple(args)" with a hash of the argument values.
                # This can resolve the following issues:
                # 1. An invalid weak-ref leaves a dead (dangling) entry in both caches: "self.mapping" and "self.compiled_functions"
                # 2. Different argument instances with the same type and value get templatized into separate kernels.
                return weakref.ref(arg)

            # [Primitive arguments] Return the value
            return arg
        if isinstance(annotation, ArgPackType):
            if not isinstance(arg, ArgPack):
                raise TaichiRuntimeTypeError(f"Argument {arg_name} must be an argument pack, got {type(arg)}")
            return tuple(
                TaichiCallableTemplateMapper.extract_arg(arg[name], dtype, arg_name)
                for index, (name, dtype) in enumerate(annotation.members.items())
            )
        if dataclasses.is_dataclass(annotation):
            _res_l = []
            for field in dataclasses.fields(annotation):
                field_value = getattr(arg, field.name)
                # Build each flattened name from the original arg_name, one per field.
                field_arg_name = f"__ti_{arg_name}_{field.name}"
                field_extracted = TaichiCallableTemplateMapper.extract_arg(field_value, field.type, field_arg_name)
                _res_l.append(field_extracted)
            return tuple(_res_l)
        if isinstance(annotation, texture_type.TextureType):
            if not isinstance(arg, taichi.lang._texture.Texture):
                raise TaichiRuntimeTypeError(f"Argument {arg_name} must be a texture, got {type(arg)}")
            if arg.num_dims != annotation.num_dimensions:
                raise TaichiRuntimeTypeError(
                    f"TextureType dimension mismatch for argument {arg_name}: expected {annotation.num_dimensions}, got {arg.num_dims}"
                )
            return (arg.num_dims,)
        if isinstance(annotation, texture_type.RWTextureType):
            if not isinstance(arg, taichi.lang._texture.Texture):
                raise TaichiRuntimeTypeError(f"Argument {arg_name} must be a texture, got {type(arg)}")
            if arg.num_dims != annotation.num_dimensions:
                raise TaichiRuntimeTypeError(
                    f"RWTextureType dimension mismatch for argument {arg_name}: expected {annotation.num_dimensions}, got {arg.num_dims}"
                )
            if arg.fmt != annotation.fmt:
                raise TaichiRuntimeTypeError(
                    f"RWTextureType format mismatch for argument {arg_name}: expected {annotation.fmt}, got {arg.fmt}"
                )
            # (penguinliong) '0' is the assumed LOD level. We currently don't
            # support mip-mapping.
            return arg.num_dims, arg.fmt, 0
        if isinstance(annotation, ndarray_type.NdarrayType):
            if isinstance(arg, taichi.lang._ndarray.Ndarray):
                annotation.check_matched(arg.get_type(), arg_name)
                needs_grad = (arg.grad is not None) if annotation.needs_grad is None else annotation.needs_grad
                assert arg.shape is not None
                return arg.element_type, len(arg.shape), needs_grad, annotation.boundary
            if isinstance(arg, AnyArray):
                ty = arg.get_type()
                annotation.check_matched(arg.get_type(), arg_name)
                return ty.element_type, len(arg.shape), ty.needs_grad, annotation.boundary
            # external arrays
            shape = getattr(arg, "shape", None)
            if shape is None:
                raise TaichiRuntimeTypeError(f"Invalid type for argument {arg_name}, got {arg}")
            shape = tuple(shape)
            element_shape: tuple[int, ...] = ()
            dtype = to_taichi_type(arg.dtype)
            if isinstance(annotation.dtype, MatrixType):
                if annotation.ndim is not None:
                    if len(shape) != annotation.dtype.ndim + annotation.ndim:
                        raise ValueError(
                            f"Invalid value for argument {arg_name} - required array has ndim={annotation.ndim} element_dim={annotation.dtype.ndim}, "
                            f"but an array with {len(shape)} dimensions was provided"
                        )
                else:
                    if len(shape) < annotation.dtype.ndim:
                        raise ValueError(
                            f"Invalid value for argument {arg_name} - required element_dim={annotation.dtype.ndim}, "
                            f"but an array with {len(shape)} dimensions was provided"
                        )
                element_shape = shape[-annotation.dtype.ndim :]
                anno_element_shape = annotation.dtype.get_shape()
                if None not in anno_element_shape and element_shape != anno_element_shape:
                    raise ValueError(
                        f"Invalid value for argument {arg_name} - required element_shape={anno_element_shape}, "
                        f"but an array with element shape {element_shape} was provided"
                    )
            elif annotation.dtype is not None:
                # User specified scalar dtype
                if annotation.dtype != dtype:
                    raise ValueError(
                        f"Invalid value for argument {arg_name} - required array has dtype={annotation.dtype.to_string()}, "
                        f"but an array with dtype={dtype.to_string()} was provided"
                    )

            if annotation.ndim is not None and len(shape) != annotation.ndim:
                raise ValueError(
                    f"Invalid value for argument {arg_name} - required array has ndim={annotation.ndim}, "
                    f"but an array with {len(shape)} dimensions was provided"
                )
            needs_grad = (
                getattr(arg, "requires_grad", False) if annotation.needs_grad is None else annotation.needs_grad
            )
            element_type = (
                _ti_core.get_type_factory_instance().get_tensor_type(element_shape, dtype)
                if len(element_shape) != 0
                else arg.dtype
            )
            return element_type, len(shape) - len(element_shape), needs_grad, annotation.boundary
        if isinstance(annotation, sparse_matrix_builder):
            return arg.dtype
        # Use '#' as a placeholder because other kinds of arguments are not involved in template instantiation
        return "#"

    def extract(self, args):
        extracted = []
        for arg, kernel_arg in zip(args, self.arguments):
            extracted.append(self.extract_arg(arg, kernel_arg.annotation, kernel_arg.name))
        return tuple(extracted)

    def lookup(self, args):
        if len(args) != self.num_args:
            raise TypeError(f"{self.num_args} argument(s) needed but {len(args)} provided.")

        key = self.extract(args)
        if key not in self.mapping:
            count = len(self.mapping)
            self.mapping[key] = count
        return self.mapping[key], key

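# A sketch of how `TaichiCallableTemplateMapper.lookup` assigns instance ids
# (illustrative; `mapper`, `arr_a` and `arr_b` are hypothetical, with both
# ndarrays sharing the same dtype and ndim):
#
#     >>> inst_a, feats_a = mapper.lookup((arr_a,))
#     >>> inst_b, feats_b = mapper.lookup((arr_b,))
#     >>> inst_a == inst_b  # identical features -> same cached instance id
#     True
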
def _get_global_vars(_func):
    # Discussions: https://github.com/taichi-dev/taichi/issues/282
    global_vars = _func.__globals__.copy()

    freevar_names = _func.__code__.co_freevars
    closure = _func.__closure__
    if closure:
        freevar_values = list(map(lambda x: x.cell_contents, closure))
        for name, value in zip(freevar_names, freevar_values):
            global_vars[name] = value

    return global_vars

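# The closure handling above relies on plain CPython semantics; a quick
# sketch:
#
#     >>> def outer():
#     >>>     y = 3
#     >>>     def inner(x):
#     >>>         return x + y
#     >>>     return inner
#     >>>
#     >>> f = outer()
#     >>> f.__code__.co_freevars           # ('y',)
#     >>> f.__closure__[0].cell_contents   # 3
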
class Kernel:
    counter = 0

    def __init__(self, _func: Callable, autodiff_mode, _classkernel=False):
        self.func = _func
        self.kernel_counter = Kernel.counter
        Kernel.counter += 1
        assert autodiff_mode in (
            AutodiffMode.NONE,
            AutodiffMode.VALIDATION,
            AutodiffMode.FORWARD,
            AutodiffMode.REVERSE,
        )
        self.autodiff_mode = autodiff_mode
        self.grad: Kernel | None = None
        self.arguments: list[KernelArgument] = []
        self.return_type = None
        self.classkernel = _classkernel
        self.extract_arguments()
        self.template_slot_locations = []
        for i, arg in enumerate(self.arguments):
            if arg.annotation == template or isinstance(arg.annotation, template):
                self.template_slot_locations.append(i)
        self.mapper = TaichiCallableTemplateMapper(self.arguments, self.template_slot_locations)
        impl.get_runtime().kernels.append(self)
        self.reset()
        self.kernel_cpp = None
        self.compiled_kernels = {}
        self.has_print = False

    def ast_builder(self) -> ASTBuilder:
        assert self.kernel_cpp is not None
        return self.kernel_cpp.ast_builder()

    def reset(self):
        self.runtime = impl.get_runtime()
        self.compiled_kernels = {}

    def extract_arguments(self):
        sig = inspect.signature(self.func)
        if sig.return_annotation not in (inspect._empty, None):
            self.return_type = sig.return_annotation
            if (
                isinstance(self.return_type, (types.GenericAlias, typing._GenericAlias))
                and self.return_type.__origin__ is tuple
            ):
                self.return_type = self.return_type.__args__
            if not isinstance(self.return_type, (list, tuple)):
                self.return_type = (self.return_type,)
            for return_type in self.return_type:
                if return_type is Ellipsis:
                    raise TaichiSyntaxError("Ellipsis is not supported in return type annotations")
        params = sig.parameters
        arg_names = params.keys()
        for i, arg_name in enumerate(arg_names):
            param = params[arg_name]
            if param.kind == inspect.Parameter.VAR_KEYWORD:
                raise TaichiSyntaxError("Taichi kernels do not support variable keyword parameters (i.e., **kwargs)")
            if param.kind == inspect.Parameter.VAR_POSITIONAL:
                raise TaichiSyntaxError("Taichi kernels do not support variable positional parameters (i.e., *args)")
            if param.default is not inspect.Parameter.empty:
                raise TaichiSyntaxError("Taichi kernels do not support default values for arguments")
            if param.kind == inspect.Parameter.KEYWORD_ONLY:
                raise TaichiSyntaxError("Taichi kernels do not support keyword parameters")
            if param.kind != inspect.Parameter.POSITIONAL_OR_KEYWORD:
                raise TaichiSyntaxError('Taichi kernels only support "positional or keyword" parameters')
            annotation = param.annotation
            if param.annotation is inspect.Parameter.empty:
                if i == 0 and self.classkernel:  # The |self| parameter
                    annotation = template()
                else:
                    raise TaichiSyntaxError("Taichi kernel parameters must be type annotated")
            else:
                if isinstance(
                    annotation,
                    (
                        template,
                        ndarray_type.NdarrayType,
                        texture_type.TextureType,
                        texture_type.RWTextureType,
                    ),
                ):
                    pass
                elif id(annotation) in primitive_types.type_ids:
                    pass
                elif isinstance(annotation, sparse_matrix_builder):
                    pass
                elif isinstance(annotation, MatrixType):
                    pass
                elif isinstance(annotation, StructType):
                    pass
                elif isinstance(annotation, ArgPackType):
                    pass
                elif annotation == template:
                    pass
                elif isinstance(annotation, type) and dataclasses.is_dataclass(annotation):
                    pass
                else:
                    raise TaichiSyntaxError(f"Invalid type annotation (argument {i}) of Taichi kernel: {annotation}")
            self.arguments.append(KernelArgument(annotation, param.name, param.default))

    def materialize(self, key, args: list[Any], arg_features):
        if key is None:
            key = (self.func, 0, self.autodiff_mode)
        self.runtime.materialize()

        if key in self.compiled_kernels:
            return

        kernel_name = f"{self.func.__name__}_c{self.kernel_counter}_{key[1]}"
        _logging.trace(f"Compiling kernel {kernel_name} in {self.autodiff_mode}...")

        tree, ctx = _get_tree_and_ctx(
            self,
            args=args,
            excluded_parameters=self.template_slot_locations,
            arg_features=arg_features,
        )

        if self.autodiff_mode != AutodiffMode.NONE:
            KernelSimplicityASTChecker(self.func).visit(tree)

        # Do not change the name of 'taichi_ast_generator':
        # the warning system needs this identifier to remove unnecessary messages.
        def taichi_ast_generator(kernel_cxx: Kernel):  # not sure if this type is correct, seems doubtful
            nonlocal tree
            if self.runtime.inside_kernel:
                raise TaichiSyntaxError(
                    "Kernels cannot call other kernels. I.e., nested kernels are not allowed. "
                    "Please check if you have direct/indirect invocation of kernels within kernels. "
                    "Note that some methods provided by the Taichi standard library may invoke kernels, "
                    "so please move their invocations to Python-scope."
                )
            self.kernel_cpp = kernel_cxx
            self.runtime.inside_kernel = True
            self.runtime._current_kernel = self
            assert self.runtime.compiling_callable is None
            self.runtime.compiling_callable = kernel_cxx
            try:
                ctx.ast_builder = kernel_cxx.ast_builder()

                def ast_to_dict(node):
                    if isinstance(node, ast.AST):
                        fields = {k: ast_to_dict(v) for k, v in ast.iter_fields(node)}
                        return {
                            "type": node.__class__.__name__,
                            "fields": fields,
                            "lineno": getattr(node, "lineno", None),
                            "col_offset": getattr(node, "col_offset", None),
                        }
                    if isinstance(node, list):
                        return [ast_to_dict(x) for x in node]
                    return node  # Basic types (str, int, None, etc.)

                if os.environ.get("TI_DUMP_AST", "") == "1":
                    target_dir = pathlib.Path("/tmp/ast")
                    target_dir.mkdir(parents=True, exist_ok=True)

                    start = time.time()
                    ast_str = ast.dump(tree, indent=2)
                    output_file = target_dir / f"{kernel_name}_ast.txt"
                    output_file.write_text(ast_str)
                    elapsed_txt = time.time() - start

                    start = time.time()
                    json_str = json.dumps(ast_to_dict(tree), indent=2)
                    output_file = target_dir / f"{kernel_name}_ast.json"
                    output_file.write_text(json_str)
                    elapsed_json = time.time() - start

                    output_file = target_dir / f"{kernel_name}_gen_time.json"
                    output_file.write_text(
                        json.dumps({"elapsed_txt": elapsed_txt, "elapsed_json": elapsed_json}, indent=2)
                    )
                struct_locals = extract_struct_locals_from_context(ctx)
                tree = unpack_ndarray_struct(tree, struct_locals=struct_locals)
                transform_tree(tree, ctx)
                if not ctx.is_real_function:
                    if self.return_type and ctx.returned != ReturnStatus.ReturnedValue:
                        raise TaichiSyntaxError("Kernel has a return type but does not have a return statement")
            finally:
                self.runtime.inside_kernel = False
                self.runtime._current_kernel = None
                self.runtime.compiling_callable = None

        taichi_kernel = impl.get_runtime().prog.create_kernel(taichi_ast_generator, kernel_name, self.autodiff_mode)
        assert key not in self.compiled_kernels
        self.compiled_kernels[key] = taichi_kernel

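    # The TI_DUMP_AST hook in `materialize` above can be exercised like this
    # (an illustrative sketch; `my_kernel` is a hypothetical kernel, and the
    # dump directory /tmp/ast is hard-coded above):
    #
    #     >>> os.environ["TI_DUMP_AST"] = "1"  # set before the first call compiles the kernel
    #     >>> my_kernel()
    #     >>> # /tmp/ast/ now contains <kernel_name>_ast.txt, _ast.json and _gen_time.json
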
    def launch_kernel(self, t_kernel, *args):
        assert len(args) == len(self.arguments), f"{len(self.arguments)} arguments needed but {len(args)} provided"

        tmps = []
        callbacks = []

        actual_argument_slot = 0
        launch_ctx = t_kernel.make_launch_context()
        max_arg_num = 64
        exceed_max_arg_num = False

        def set_arg_ndarray(indices, v):
            v_primal = v.arr
            v_grad = v.grad.arr if v.grad else None
            if v_grad is None:
                launch_ctx.set_arg_ndarray(indices, v_primal)
            else:
                launch_ctx.set_arg_ndarray_with_grad(indices, v_primal, v_grad)

        def set_arg_texture(indices, v):
            launch_ctx.set_arg_texture(indices, v.tex)

        def set_arg_rw_texture(indices, v):
            launch_ctx.set_arg_rw_texture(indices, v.tex)

        def set_arg_ext_array(indices, v, needed):
            # Element shapes are already specialized in Taichi codegen, so the
            # shape information for the element dims is no longer needed.
            # Therefore we strip the element shapes from the shape vector,
            # so that it only holds "real" array shapes.
            is_soa = needed.layout == Layout.SOA
            array_shape = v.shape
            if functools.reduce(operator.mul, array_shape, 1) > np.iinfo(np.int32).max:
                warnings.warn("Ndarray index might be out of int32 boundary but int64 indexing is not supported yet.")
            if needed.dtype is None or id(needed.dtype) in primitive_types.type_ids:
                element_dim = 0
            else:
                element_dim = needed.dtype.ndim
                array_shape = v.shape[element_dim:] if is_soa else v.shape[:-element_dim]
            if isinstance(v, np.ndarray):
                # numpy
                if v.flags.c_contiguous:
                    launch_ctx.set_arg_external_array_with_shape(indices, int(v.ctypes.data), v.nbytes, array_shape, 0)
                elif v.flags.f_contiguous:
                    # TODO: A better way that avoids copying is saving strides info.
                    tmp = np.ascontiguousarray(v)
                    # Purpose: DO NOT GC |tmp|!
                    tmps.append(tmp)

                    def callback(original, updated):
                        np.copyto(original, np.asfortranarray(updated))

                    callbacks.append(functools.partial(callback, v, tmp))
                    launch_ctx.set_arg_external_array_with_shape(
                        indices, int(tmp.ctypes.data), tmp.nbytes, array_shape, 0
                    )
                else:
                    raise ValueError(
                        "Non-contiguous numpy arrays are not supported; please call np.ascontiguousarray(arr) "
                        "before passing it into a taichi kernel."
                    )
            elif has_pytorch():
                import torch  # pylint: disable=C0415

                if isinstance(v, torch.Tensor):
                    if not v.is_contiguous():
                        raise ValueError(
                            "Non-contiguous tensors are not supported; please call tensor.contiguous() before "
                            "passing it into a taichi kernel."
                        )
                    taichi_arch = self.runtime.prog.config().arch

                    def get_call_back(u, v):
                        def call_back():
                            u.copy_(v)

                        return call_back

                    # FIXME: only allocate when launching grad kernel
                    if v.requires_grad and v.grad is None:
                        v.grad = torch.zeros_like(v)

                    if v.requires_grad:
                        if not isinstance(v.grad, torch.Tensor):
                            raise ValueError(
                                f"Expecting torch.Tensor for gradient tensor, but getting {v.grad.__class__.__name__} instead"
                            )
                        if not v.grad.is_contiguous():
                            raise ValueError(
                                "Non-contiguous gradient tensors are not supported; please call tensor.grad.contiguous() before passing it into a taichi kernel."
                            )

                    tmp = v
                    if (str(v.device) != "cpu") and not (
                        str(v.device).startswith("cuda") and taichi_arch == _ti_core.Arch.cuda
                    ):
                        # Getting a torch CUDA tensor on a Taichi non-CUDA arch:
                        # we just replace it with a CPU tensor, and by the end of kernel execution we use the
                        # callback to copy the values back to the original CUDA tensor.
                        host_v = v.to(device="cpu", copy=True)
                        tmp = host_v
                        callbacks.append(get_call_back(v, host_v))

                    launch_ctx.set_arg_external_array_with_shape(
                        indices,
                        int(tmp.data_ptr()),
                        tmp.element_size() * tmp.nelement(),
                        array_shape,
                        int(v.grad.data_ptr()) if v.grad is not None else 0,
                    )
                else:
                    raise TaichiRuntimeTypeError(f"Argument {needed} cannot be converted into required type {type(v)}")
            elif has_paddle():
                import paddle  # pylint: disable=C0415 # type: ignore

                if isinstance(v, paddle.Tensor):
                    # For now, paddle.fluid.core.Tensor._ptr() is only available on the develop branch
                    def get_call_back(u, v):
                        def call_back():
                            u.copy_(v, False)

                        return call_back

                    tmp = v.value().get_tensor()
                    taichi_arch = self.runtime.prog.config().arch
                    if v.place.is_gpu_place():
                        if taichi_arch != _ti_core.Arch.cuda:
                            # Paddle CUDA tensor on a Taichi non-CUDA arch
                            host_v = v.cpu()
                            tmp = host_v.value().get_tensor()
                            callbacks.append(get_call_back(v, host_v))
                    elif v.place.is_cpu_place():
                        if taichi_arch == _ti_core.Arch.cuda:
                            # Paddle CPU tensor on the Taichi CUDA arch
                            gpu_v = v.cuda()
                            tmp = gpu_v.value().get_tensor()
                            callbacks.append(get_call_back(v, gpu_v))
                    else:
                        # Paddle supports many other backends (XPU, NPU, MLU, IPU) that Taichi does not.
                        raise TaichiRuntimeTypeError(f"Taichi does not support the Paddle backend {v.place}")
                    launch_ctx.set_arg_external_array_with_shape(
                        indices, int(tmp._ptr()), v.element_size() * v.size, array_shape, 0
                    )
                else:
                    raise TaichiRuntimeTypeError(f"Argument {needed} cannot be converted into required type {v}")
            else:
                raise TaichiRuntimeTypeError(f"Argument {needed} cannot be converted into required type {v}")

        def set_arg_matrix(indices, v, needed):
            def cast_float(x):
                if not isinstance(x, (int, float, np.integer, np.floating)):
                    raise TaichiRuntimeTypeError(
                        f"Argument {needed.dtype} cannot be converted into required type {type(x)}"
                    )
                return float(x)

            def cast_int(x):
                if not isinstance(x, (int, np.integer)):
                    raise TaichiRuntimeTypeError(
                        f"Argument {needed.dtype} cannot be converted into required type {type(x)}"
                    )
                return int(x)

            cast_func = None
            if needed.dtype in primitive_types.real_types:
                cast_func = cast_float
            elif needed.dtype in primitive_types.integer_types:
                cast_func = cast_int
            else:
                raise ValueError(f"Matrix dtype {needed.dtype} is not an integer type or a real type.")

            if needed.ndim == 2:
                v = [cast_func(v[i, j]) for i in range(needed.n) for j in range(needed.m)]
            else:
                v = [cast_func(v[i]) for i in range(needed.n)]
            v = needed(*v)
            needed.set_kernel_struct_args(v, launch_ctx, indices)

        def set_arg_sparse_matrix_builder(indices, v):
            # Pass only the base pointer of the ti.types.sparse_matrix_builder() argument
            launch_ctx.set_arg_uint(indices, v._get_ndarray_addr())

        set_later_list = []

        def recursive_set_args(needed_arg_type, provided_arg_type, v, indices):
            """
            Returns the number of kernel args set.
            E.g. templates don't set kernel args, so they return 0;
            a single ndarray is 1 kernel arg, so it returns 1;
            a struct of 3 ndarrays sets 3 kernel args, so it returns 3.
            """
            in_argpack = len(indices) > 1
            nonlocal actual_argument_slot, exceed_max_arg_num, set_later_list
            if actual_argument_slot >= max_arg_num:
                exceed_max_arg_num = True
                return 0
            actual_argument_slot += 1
            if isinstance(needed_arg_type, ArgPackType):
                if not isinstance(v, ArgPack):
                    raise TaichiRuntimeTypeError.get(indices, str(needed_arg_type), str(provided_arg_type))
                idx_new = 0
                for j, (name, anno) in enumerate(needed_arg_type.members.items()):
                    idx_new += recursive_set_args(anno, type(v[name]), v[name], indices + (idx_new,))
                launch_ctx.set_arg_argpack(indices, v._ArgPack__argpack)  # type: ignore
                return 1
            # Note: do not use sth like "needed == f32". That would be slow.
            if id(needed_arg_type) in primitive_types.real_type_ids:
                if not isinstance(v, (float, int, np.floating, np.integer)):
                    raise TaichiRuntimeTypeError.get(indices, needed_arg_type.to_string(), provided_arg_type)
                if in_argpack:
                    return 1
                launch_ctx.set_arg_float(indices, float(v))
                return 1
            if id(needed_arg_type) in primitive_types.integer_type_ids:
                if not isinstance(v, (int, np.integer)):
                    raise TaichiRuntimeTypeError.get(indices, needed_arg_type.to_string(), provided_arg_type)
                if in_argpack:
                    return 1
                if is_signed(cook_dtype(needed_arg_type)):
                    launch_ctx.set_arg_int(indices, int(v))
                else:
                    launch_ctx.set_arg_uint(indices, int(v))
                return 1
            if isinstance(needed_arg_type, sparse_matrix_builder):
                if in_argpack:
                    set_later_list.append((set_arg_sparse_matrix_builder, (v,)))
                    return 0
                set_arg_sparse_matrix_builder(indices, v)
                return 1
            if dataclasses.is_dataclass(needed_arg_type):
                assert provided_arg_type == needed_arg_type
                idx = 0
                for j, field in enumerate(dataclasses.fields(needed_arg_type)):
                    assert not isinstance(field.type, str)
                    field_value = getattr(v, field.name)
                    idx += recursive_set_args(field.type, field.type, field_value, (indices[0] + idx,))
                return idx
            if isinstance(needed_arg_type, ndarray_type.NdarrayType) and isinstance(v, taichi.lang._ndarray.Ndarray):
                if in_argpack:
                    set_later_list.append((set_arg_ndarray, (v,)))
                    return 0
                set_arg_ndarray(indices, v)
                return 1
            if isinstance(needed_arg_type, texture_type.TextureType) and isinstance(v, taichi.lang._texture.Texture):
                if in_argpack:
                    set_later_list.append((set_arg_texture, (v,)))
                    return 0
                set_arg_texture(indices, v)
                return 1
            if isinstance(needed_arg_type, texture_type.RWTextureType) and isinstance(v, taichi.lang._texture.Texture):
                if in_argpack:
                    set_later_list.append((set_arg_rw_texture, (v,)))
                    return 0
                set_arg_rw_texture(indices, v)
                return 1
            if isinstance(needed_arg_type, ndarray_type.NdarrayType):
                if in_argpack:
                    set_later_list.append((set_arg_ext_array, (v, needed_arg_type)))
                    return 0
                set_arg_ext_array(indices, v, needed_arg_type)
                return 1
            if isinstance(needed_arg_type, MatrixType):
                if in_argpack:
                    return 1
                set_arg_matrix(indices, v, needed_arg_type)
                return 1
            if isinstance(needed_arg_type, StructType):
                if in_argpack:
                    return 1
                if not isinstance(v, needed_arg_type):
                    raise TaichiRuntimeTypeError(
                        f"Argument {provided_arg_type} cannot be converted into required type {needed_arg_type}"
                    )
                needed_arg_type.set_kernel_struct_args(v, launch_ctx, indices)
                return 1
            if needed_arg_type == template or isinstance(needed_arg_type, template):
                return 0
            raise ValueError(f"Argument type mismatch. Expecting {needed_arg_type}, got {type(v)}.")

        template_num = 0
        i_out = 0
        for i_in, val in enumerate(args):
            needed_ = self.arguments[i_in].annotation
            if needed_ == template or isinstance(needed_, template):
                template_num += 1
                i_out += 1
                continue
            i_out += recursive_set_args(needed_, type(val), val, (i_out - template_num,))

        for i, (set_arg_func, params) in enumerate(set_later_list):
            set_arg_func((len(args) - template_num + i,), *params)

        if exceed_max_arg_num:
            raise TaichiRuntimeError(
                f"The number of elements in kernel arguments is too big! Do not exceed {max_arg_num} on the {_ti_core.arch_name(impl.current_cfg().arch)} backend."
            )

        try:
            prog = impl.get_runtime().prog
            # Compile kernel (& Online Cache & Offline Cache)
            compiled_kernel_data = prog.compile_kernel(prog.config(), prog.get_device_caps(), t_kernel)
            # Launch kernel
            prog.launch_kernel(compiled_kernel_data, launch_ctx)
        except Exception as e:
            e = handle_exception_from_cpp(e)
            if impl.get_runtime().print_full_traceback:
                raise e
            raise e from None

        ret = None
        ret_dt = self.return_type
        has_ret = ret_dt is not None

        if has_ret or self.has_print:
            runtime_ops.sync()

        if has_ret:
            ret = []
            for i, ret_type in enumerate(ret_dt):
                ret.append(self.construct_kernel_ret(launch_ctx, ret_type, (i,)))
            if len(ret_dt) == 1:
                ret = ret[0]
        if callbacks:
            for c in callbacks:
                c()

        return ret

1165
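+     # A hedged usage sketch, not part of this module: the dispatch above maps
+     # each Python-side argument onto one launch-context index slot, except that
+     # template arguments consume no slot and argpack members are deferred via
+     # set_later_list. Assuming the public `ti` API and an initialized backend:
+     #
+     #   >>> arr = ti.ndarray(ti.f32, shape=8)
+     #   >>>
+     #   >>> @ti.kernel
+     #   >>> def scale(a: ti.types.ndarray(), k: ti.f32):
+     #   >>>     for i in range(8):
+     #   >>>         a[i] = a[i] * k
+     #   >>>
+     #   >>> scale(arr, 2.0)  # arr fills slot 0 (NdarrayType branch), k fills slot 1
+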
+     def construct_kernel_ret(self, launch_ctx, ret_type, index=()):
+         if isinstance(ret_type, CompoundType):
+             return ret_type.from_kernel_struct_ret(launch_ctx, index)
+         if ret_type in primitive_types.integer_types:
+             if is_signed(cook_dtype(ret_type)):
+                 return launch_ctx.get_struct_ret_int(index)
+             return launch_ctx.get_struct_ret_uint(index)
+         if ret_type in primitive_types.real_types:
+             return launch_ctx.get_struct_ret_float(index)
+         raise TaichiRuntimeTypeError(f"Invalid return type on index={index}")
+
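+     # A hedged illustration, assuming the public `ti` API: a scalar annotation
+     # such as `-> ti.i32` is read back through get_struct_ret_int above, while
+     # a compound annotation (e.g. `-> ti.types.vector(3, ti.f32)`) takes the
+     # CompoundType path via from_kernel_struct_ret.
+     #
+     #   >>> @ti.kernel
+     #   >>> def total() -> ti.i32:
+     #   >>>     s = 0
+     #   >>>     for i in range(10):
+     #   >>>         s += i
+     #   >>>     return s
+     #   >>>
+     #   >>> total()  # 45
+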
+     def ensure_compiled(self, *args):
+         instance_id, arg_features = self.mapper.lookup(args)
+         key = (self.func, instance_id, self.autodiff_mode)
+         self.materialize(key=key, args=args, arg_features=arg_features)
+         return key
+
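+     # A hedged note on the cache key above: `mapper.lookup` folds template
+     # arguments into `instance_id`, so structurally different template
+     # arguments to the same kernel yield distinct keys and distinct compiled
+     # variants. Sketch, assuming the public `ti` API:
+     #
+     #   >>> a = ti.field(ti.i32, shape=4)
+     #   >>> b = ti.field(ti.f32, shape=4)
+     #   >>>
+     #   >>> @ti.kernel
+     #   >>> def fill(f: ti.template()):
+     #   >>>     for i in f:
+     #   >>>         f[i] = 1
+     #   >>>
+     #   >>> fill(a)  # compiles and caches one variant
+     #   >>> fill(b)  # different template argument -> a second variant
+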
+     # For small kernels (< 3 us), performance can be quite sensitive to the
+     # overhead in __call__, so this part needs to be fast (i.e. < 3 us on a
+     # 4 GHz x64 CPU).
+     @_shell_pop_print
+     def __call__(self, *args, **kwargs):
+         args = _process_args(self, is_func=False, args=args, kwargs=kwargs)
+
+         # Transform the primal kernel into a forward-mode grad kernel, then
+         # recover the primal when exiting the forward-mode manager.
+         if self.runtime.fwd_mode_manager and not self.runtime.grad_replaced:
+             # TODO: if we would like to compute 2nd-order derivatives by forward-on-reverse
+             # in a nested context-manager fashion, i.e. a `Tape` nested in the `FwdMode`,
+             # we can transform only the kernels with `mode_original == AutodiffMode.REVERSE`,
+             # to avoid duplicate computation of 1st-order derivatives.
+             self.runtime.fwd_mode_manager.insert(self)
+
+         # Both the class kernels and the plain-function kernels are unified now.
+         # In both cases, |self.grad| is another Kernel instance that computes the
+         # gradient. For class kernels, args[0] is always the kernel owner.
+
+         # No need to capture grad kernels: they are already bound to their primal kernels.
+         if (
+             self.autodiff_mode in (AutodiffMode.NONE, AutodiffMode.VALIDATION)
+             and self.runtime.target_tape
+             and not self.runtime.grad_replaced
+         ):
+             self.runtime.target_tape.insert(self, args)
+
+         if self.autodiff_mode != AutodiffMode.NONE and impl.current_cfg().opt_level == 0:
+             _logging.warn("""opt_level = 1 is enforced to enable gradient computation.""")
+             impl.current_cfg().opt_level = 1
+         key = self.ensure_compiled(*args)
+         kernel_cpp = self.compiled_kernels[key]
+         return self.launch_kernel(kernel_cpp, *args)
+
+
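+ # A hedged usage sketch for the tape capture in __call__ above, assuming the
+ # public `ti.ad` API: kernels invoked under an active Tape are recorded so
+ # their adjoints can be replayed in reverse order when the Tape exits.
+ #
+ #   >>> x = ti.field(ti.f32, shape=(), needs_grad=True)
+ #   >>> loss = ti.field(ti.f32, shape=(), needs_grad=True)
+ #   >>>
+ #   >>> @ti.kernel
+ #   >>> def square():
+ #   >>>     loss[None] = x[None] * x[None]
+ #   >>>
+ #   >>> x[None] = 3.0
+ #   >>> with ti.ad.Tape(loss=loss):
+ #   >>>     square()
+ #   >>> x.grad[None]  # 6.0
+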
+ # For a Taichi class definition like below:
+ #
+ # @ti.data_oriented
+ # class X:
+ #     @ti.kernel
+ #     def foo(self):
+ #         ...
+ #
+ # When ti.kernel runs, the stack frame's |code_context| in Python 3.8(+) is
+ # different from that in Python 3.7 and below. In 3.8+ it is 'class X:',
+ # whereas in <=3.7 it is '@ti.data_oriented'. More interestingly, if the class
+ # inherits, i.e. class X(object):, then |code_context| is 'class X(object):'
+ # in both versions.
+ _KERNEL_CLASS_STACKFRAME_STMT_RES = [
+     re.compile(r"@(\w+\.)?data_oriented"),
+     re.compile(r"class "),
+ ]
+
+
+ def _inside_class(level_of_class_stackframe):
+     try:
+         maybe_class_frame = sys._getframe(level_of_class_stackframe)
+         statement_list = inspect.getframeinfo(maybe_class_frame)[3]
+         if statement_list is None:
+             return False
+         first_statement = statement_list[0].strip()
+         for pat in _KERNEL_CLASS_STACKFRAME_STMT_RES:
+             if pat.match(first_statement):
+                 return True
+     except Exception:
+         pass
+     return False
+
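+ # A minimal standalone sketch of the frame-inspection trick above (hedged;
+ # `probe` is a hypothetical name, not part of this module). The printed
+ # context line is what _KERNEL_CLASS_STACKFRAME_STMT_RES is matched against;
+ # its exact text varies across Python versions, as described above.
+ #
+ #   >>> import sys, inspect
+ #   >>>
+ #   >>> def probe(fn):
+ #   >>>     frame = sys._getframe(1)  # frame of the scope applying the decorator
+ #   >>>     print(inspect.getframeinfo(frame)[3])  # source context at that frame
+ #   >>>     return fn
+ #   >>>
+ #   >>> class X:
+ #   >>>     @probe
+ #   >>>     def f(self):
+ #   >>>         pass
+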
+
+ def _kernel_impl(_func: Callable, level_of_class_stackframe: int, verbose: bool = False):
+     # Can decorators determine if a function is being defined inside a class?
+     # https://stackoverflow.com/a/8793684/12003165
+     is_classkernel = _inside_class(level_of_class_stackframe + 1)
+
+     if verbose:
+         print(f"kernel={_func.__name__} is_classkernel={is_classkernel}")
+     primal = Kernel(_func, autodiff_mode=AutodiffMode.NONE, _classkernel=is_classkernel)
+     adjoint = Kernel(_func, autodiff_mode=AutodiffMode.REVERSE, _classkernel=is_classkernel)
+     # Having |primal| contain |grad| makes the tape work.
+     primal.grad = adjoint
+
+     if is_classkernel:
+         # For class kernels, the primal/adjoint callables are constructed when
+         # the kernel is accessed via the instance inside
+         # _BoundedDifferentiableMethod.
+         # This is because we need to bind the kernel or |grad| to the instance
+         # owning the kernel, which is not known until the kernel is accessed.
+         #
+         # See also: _BoundedDifferentiableMethod, data_oriented.
+         @functools.wraps(_func)
+         def wrapped(*args, **kwargs):
+             # If we ever reach here (we should not), the class was not decorated
+             # with @ti.data_oriented; otherwise __getattribute__ would have
+             # intercepted the call.
+             clsobj = type(args[0])
+             assert not hasattr(clsobj, "_data_oriented")
+             raise TaichiSyntaxError(f"Please decorate class {clsobj.__name__} with @ti.data_oriented")
+
+     else:
+
+         @functools.wraps(_func)
+         def wrapped(*args, **kwargs):
+             try:
+                 return primal(*args, **kwargs)
+             except (TaichiCompilationError, TaichiRuntimeError) as e:
+                 if impl.get_runtime().print_full_traceback:
+                     raise e
+                 raise type(e)("\n" + str(e)) from None
+
+         wrapped.grad = adjoint
+
+     wrapped._is_wrapped_kernel = True
+     wrapped._is_classkernel = is_classkernel
+     wrapped._primal = primal
+     wrapped._adjoint = adjoint
+     return wrapped
+
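+ # A hedged sketch of the plain-function path above, assuming the public `ti`
+ # API: the returned wrapper keeps the reverse-mode kernel on `.grad`, so a
+ # primal/adjoint pair can also be driven manually:
+ #
+ #   >>> x = ti.field(ti.f32, shape=4, needs_grad=True)
+ #   >>> loss = ti.field(ti.f32, shape=(), needs_grad=True)
+ #   >>>
+ #   >>> @ti.kernel
+ #   >>> def reduce_sum():
+ #   >>>     for i in x:
+ #   >>>         loss[None] += x[i]
+ #   >>>
+ #   >>> reduce_sum()
+ #   >>> loss.grad[None] = 1.0  # seed the adjoint
+ #   >>> reduce_sum.grad()      # runs the REVERSE-mode kernel; fills x.grad
+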
+
+ def kernel(fn: Callable):
+     """Marks a function as a Taichi kernel.
+
+     A Taichi kernel is a function written in Python that gets JIT compiled by
+     Taichi into native CPU/GPU instructions (e.g. a series of CUDA kernels).
+     The top-level ``for`` loops are automatically parallelized and distributed
+     to either a CPU thread pool or massively parallel GPUs.
+
+     A kernel's gradient kernel is generated automatically by the AutoDiff system.
+
+     See also https://docs.taichi-lang.org/docs/syntax#kernel.
+
+     Args:
+         fn (Callable): the Python function to be decorated
+
+     Returns:
+         Callable: The decorated function
+
+     Example::
+
+         >>> x = ti.field(ti.i32, shape=(4, 8))
+         >>>
+         >>> @ti.kernel
+         >>> def run():
+         >>>     # Assigns all the elements of `x` in parallel.
+         >>>     for i in x:
+         >>>         x[i] = i
+     """
+     return _kernel_impl(fn, level_of_class_stackframe=3)
+
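+ # Hedged note: the wrapper returned by ti.kernel is an ordinary Python
+ # callable; compilation happens lazily on the first call (per argument
+ # signature, via ensure_compiled), so only the first launch pays the JIT cost.
+ #
+ #   >>> run()  # first call: compile + launch
+ #   >>> run()  # cache hit: launch only
+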
+
+ class _BoundedDifferentiableMethod:
+     def __init__(self, kernel_owner, wrapped_kernel_func):
+         clsobj = type(kernel_owner)
+         if not getattr(clsobj, "_data_oriented", False):
+             raise TaichiSyntaxError(f"Please decorate class {clsobj.__name__} with @ti.data_oriented")
+         self._kernel_owner = kernel_owner
+         self._primal = wrapped_kernel_func._primal
+         self._adjoint = wrapped_kernel_func._adjoint
+         self._is_staticmethod = wrapped_kernel_func._is_staticmethod
+         self.__name__: str | None = None
+
+     def __call__(self, *args, **kwargs):
+         try:
+             if self._is_staticmethod:
+                 return self._primal(*args, **kwargs)
+             return self._primal(self._kernel_owner, *args, **kwargs)
+         except (TaichiCompilationError, TaichiRuntimeError) as e:
+             if impl.get_runtime().print_full_traceback:
+                 raise e
+             raise type(e)("\n" + str(e)) from None
+
+     def grad(self, *args, **kwargs):
+         return self._adjoint(self._kernel_owner, *args, **kwargs)
+
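+ # A hedged usage sketch, assuming the public `ti` API: attribute access on a
+ # @ti.data_oriented instance returns a _BoundedDifferentiableMethod, so both
+ # the primal and its adjoint can be called through the instance:
+ #
+ #   >>> a = TiArray(32)  # see the data_oriented docstring below
+ #   >>> a.inc()          # primal; __call__ injects self._kernel_owner
+ #   >>> a.inc.grad()     # adjoint (requires fields created with needs_grad=True)
+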
+
+ def data_oriented(cls):
+     """Marks a class as Taichi compatible.
+
+     To allow for modularized code, Taichi provides this decorator so that
+     Taichi kernels can be defined inside a class.
+
+     See also https://docs.taichi-lang.org/docs/odop
+
+     Example::
+
+         >>> @ti.data_oriented
+         >>> class TiArray:
+         >>>     def __init__(self, n):
+         >>>         self.x = ti.field(ti.f32, shape=n)
+         >>>
+         >>>     @ti.kernel
+         >>>     def inc(self):
+         >>>         for i in self.x:
+         >>>             self.x[i] += 1.0
+         >>>
+         >>> a = TiArray(32)
+         >>> a.inc()
+
+     Args:
+         cls (Class): the class to be decorated
+
+     Returns:
+         The decorated class.
+     """
+
+     def _getattr(self, item):
+         method = cls.__dict__.get(item, None)
+         is_property = method.__class__ == property
+         is_staticmethod = method.__class__ == staticmethod
+         if is_property:
+             x = method.fget
+         else:
+             x = super(cls, self).__getattribute__(item)
+         if hasattr(x, "_is_wrapped_kernel"):
+             if inspect.ismethod(x):
+                 wrapped = x.__func__
+             else:
+                 wrapped = x
+             wrapped._is_staticmethod = is_staticmethod
+             assert inspect.isfunction(wrapped)
+             if wrapped._is_classkernel:
+                 ret = _BoundedDifferentiableMethod(self, wrapped)
+                 ret.__name__ = wrapped.__name__
+                 if is_property:
+                     return ret()
+                 return ret
+         if is_property:
+             return x(self)
+         return x
+
+     cls.__getattribute__ = _getattr
+     cls._data_oriented = True
+
+     return cls
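+
+ # A hedged illustration of the property/staticmethod handling above: a kernel
+ # wrapped in @property is invoked on attribute access (`return ret()`), and a
+ # @staticmethod kernel skips the owner binding in
+ # _BoundedDifferentiableMethod.__call__.
+ #
+ #   >>> @ti.data_oriented
+ #   >>> class S:
+ #   >>>     @staticmethod
+ #   >>>     @ti.kernel
+ #   >>>     def ping():
+ #   >>>         print("ping")
+ #   >>>
+ #   >>> S().ping()  # dispatched with _is_staticmethod=True; no owner argument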
+
+
+ __all__ = ["data_oriented", "func", "kernel", "pyfunc", "real_func"]