PyPI - gstaichi - Versions diffs - 1.0.1__cp311-cp311-macosx_15_0_arm64.whl → 2.0.0__cp311-cp311-macosx_15_0_arm64.whl - Mend

gstaichi 1.0.1__cp311-cp311-macosx_15_0_arm64.whl → 2.0.0__cp311-cp311-macosx_15_0_arm64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (105)

gstaichi/lang/kernel_impl.py CHANGED Viewed

@@ -29,11 +29,11 @@ from gstaichi._lib.core.gstaichi_python import (
     KernelCxx,
     KernelLaunchContext,
 )
-from gstaichi.lang import impl, ops, runtime_ops
-from gstaichi.lang._template_mapper import GsTaichiCallableTemplateMapper
-from gstaichi.lang._wrap_inspect import getsourcefile, getsourcelines
+from gstaichi.lang import _kernel_impl_dataclass, impl, ops, runtime_ops
+from gstaichi.lang._fast_caching import src_hasher
+from gstaichi.lang._template_mapper import TemplateMapper
+from gstaichi.lang._wrap_inspect import FunctionSourceInfo, get_source_info_and_src
 from gstaichi.lang.any_array import AnyArray
-from gstaichi.lang.argpack import ArgPack, ArgPackType
 from gstaichi.lang.ast import (
     ASTTransformerContext,
     KernelSimplicityASTChecker,
@@ -49,11 +49,11 @@ from gstaichi.lang.exception import (
     handle_exception_from_cpp,
 )
 from gstaichi.lang.expr import Expr
-from gstaichi.lang.kernel_arguments import KernelArgument
+from gstaichi.lang.kernel_arguments import ArgMetadata
 from gstaichi.lang.matrix import MatrixType
 from gstaichi.lang.shell import _shell_pop_print
 from gstaichi.lang.struct import StructType
-from gstaichi.lang.util import cook_dtype, has_paddle, has_pytorch
+from gstaichi.lang.util import cook_dtype, has_pytorch
 from gstaichi.types import (
     ndarray_type,
     primitive_types,
@@ -152,6 +152,7 @@ class GsTaichiCallable:
         self._adjoint: Kernel | None = None
         self.grad: Kernel | None = None
         self._is_staticmethod: bool = False
+        self.is_pure = False
         functools.update_wrapper(self, fn)
     def __call__(self, *args, **kwargs):
@@ -243,17 +244,45 @@ def pyfunc(fn: Callable) -> GsTaichiCallable:
     return gstaichi_callable
+def _populate_global_vars_for_templates(
+    template_slot_locations: list[int],
+    argument_metas: list[ArgMetadata],
+    global_vars: dict[str, Any],
+    fn: Callable,
+    py_args: tuple[Any, ...],
+):
+    """
+    Inject template parameters into globals
+    Globals are being abused to store the python objects associated
+    with templates. We continue this approach, and in addition this function
+    handles injecting expanded python variables from dataclasses.
+    """
+    for i in template_slot_locations:
+        template_var_name = argument_metas[i].name
+        global_vars[template_var_name] = py_args[i]
+    parameters = inspect.signature(fn).parameters
+    for i, (parameter_name, parameter) in enumerate(parameters.items()):
+        if dataclasses.is_dataclass(parameter.annotation):
+            _kernel_impl_dataclass.populate_global_vars_from_dataclass(
+                parameter_name,
+                parameter.annotation,
+                py_args[i],
+                global_vars=global_vars,
+            )
 def _get_tree_and_ctx(
     self: "Func | Kernel",
     args: tuple[Any, ...],
     excluded_parameters=(),
     is_kernel: bool = True,
     arg_features=None,
-    ast_builder: ASTBuilder | None = None,
+    ast_builder: "ASTBuilder | None" = None,
     is_real_function: bool = False,
+    current_kernel: "Kernel | None" = None,
 ) -> tuple[ast.Module, ASTTransformerContext]:
-    file = getsourcefile(self.func)
-    src, start_lineno = getsourcelines(self.func)
+    function_source_info, src = get_source_info_and_src(self.func)
     src = [textwrap.fill(line, tabsize=4, width=9999) for line in src]
     tree = ast.parse(textwrap.dedent("\n".join(src)))
@@ -263,17 +292,20 @@ def _get_tree_and_ctx(
     global_vars = _get_global_vars(self.func)
     if is_kernel or is_real_function:
-        # inject template parameters into globals
-        for i in self.template_slot_locations:
-            template_var_name = self.arguments[i].name
-            global_vars[template_var_name] = args[i]
-        parameters = inspect.signature(self.func).parameters
-        for arg_i, (param_name, param) in enumerate(parameters.items()):
-            if dataclasses.is_dataclass(param.annotation):
-                for member_field in dataclasses.fields(param.annotation):
-                    child_value = getattr(args[arg_i], member_field.name)
-                    flat_name = f"__ti_{param_name}_{member_field.name}"
-                    global_vars[flat_name] = child_value
+        _populate_global_vars_for_templates(
+            template_slot_locations=self.template_slot_locations,
+            argument_metas=self.arg_metas,
+            global_vars=global_vars,
+            fn=self.func,
+            py_args=args,
+        )
+    if current_kernel is not None:  # Kernel
+        current_kernel.kernel_function_info = function_source_info
+    if current_kernel is None:
+        current_kernel = impl.get_runtime()._current_kernel
+    assert current_kernel is not None
+    current_kernel.visited_functions.add(function_source_info)
     return tree, ASTTransformerContext(
         excluded_parameters=excluded_parameters,
@@ -283,38 +315,24 @@ def _get_tree_and_ctx(
         global_vars=global_vars,
         argument_data=args,
         src=src,
-        start_lineno=start_lineno,
-        file=file,
+        start_lineno=function_source_info.start_lineno,
+        end_lineno=function_source_info.end_lineno,
+        file=function_source_info.filepath,
         ast_builder=ast_builder,
         is_real_function=is_real_function,
     )
-def expand_func_arguments(arguments: list[KernelArgument]) -> list[KernelArgument]:
-    new_arguments = []
-    for argument in arguments:
-        if dataclasses.is_dataclass(argument.annotation):
-            for field in dataclasses.fields(argument.annotation):
-                new_argument = KernelArgument(
-                    _annotation=field.type,
-                    _name=f"__ti_{argument.name}_{field.name}",
-                )
-                new_arguments.append(new_argument)
-        else:
-            new_arguments.append(argument)
-    return new_arguments
 def _process_args(self: "Func | Kernel", is_func: bool, args: tuple[Any, ...], kwargs) -> tuple[Any, ...]:
     if is_func:
-        self.arguments = expand_func_arguments(self.arguments)
-    fused_args = [argument.default for argument in self.arguments]
-    ret: list[Any] = [argument.default for argument in self.arguments]
+        self.arg_metas = _kernel_impl_dataclass.expand_func_arguments(self.arg_metas)
+    fused_args: list[Any] = [arg_meta.default for arg_meta in self.arg_metas]
     len_args = len(args)
     if len_args > len(fused_args):
-        arg_str = ", ".join([str(arg) for arg in args])
-        expected_str = ", ".join([f"{arg.name} : {arg.annotation}" for arg in self.arguments])
+        arg_str = ", ".join(map(str, args))
+        expected_str = ", ".join(f"{arg.name} : {arg.annotation}" for arg in self.arg_metas)
         msg = f"Too many arguments. Expected ({expected_str}), got ({arg_str})."
         raise GsTaichiSyntaxError(msg)
@@ -322,69 +340,27 @@ def _process_args(self: "Func | Kernel", is_func: bool, args: tuple[Any, ...], k
         fused_args[i] = arg
     for key, value in kwargs.items():
-        found = False
-        for i, arg in enumerate(self.arguments):
+        for i, arg in enumerate(self.arg_metas):
             if key == arg.name:
                 if i < len_args:
                     raise GsTaichiSyntaxError(f"Multiple values for argument '{key}'.")
                 fused_args[i] = value
-                found = True
                 break
-        if not found:
+        else:
             raise GsTaichiSyntaxError(f"Unexpected argument '{key}'.")
     for i, arg in enumerate(fused_args):
         if arg is inspect.Parameter.empty:
-            if self.arguments[i].annotation is inspect._empty:
-                raise GsTaichiSyntaxError(f"Parameter `{self.arguments[i].name}` missing.")
+            if self.arg_metas[i].annotation is inspect._empty:
+                raise GsTaichiSyntaxError(f"Parameter `{self.arg_metas[i].name}` missing.")
             else:
                 raise GsTaichiSyntaxError(
-                    f"Parameter `{self.arguments[i].name} : {self.arguments[i].annotation}` missing."
+                    f"Parameter `{self.arg_metas[i].name} : {self.arg_metas[i].annotation}` missing."
                 )
     return tuple(fused_args)
-def unpack_ndarray_struct(tree: ast.Module, struct_locals: set[str]) -> ast.Module:
-    class AttributeToNameTransformer(ast.NodeTransformer):
-        def visit_Attribute(self, node: ast.Attribute):
-            if isinstance(node.value, ast.Attribute):
-                return node
-            if not isinstance(node.value, ast.Name):
-                return node
-            base_id = node.value.id
-            attr_name = node.attr
-            new_id = f"__ti_{base_id}_{attr_name}"
-            if new_id not in struct_locals:
-                return node
-            return ast.copy_location(ast.Name(id=new_id, ctx=node.ctx), node)
-    transformer = AttributeToNameTransformer()
-    new_tree = transformer.visit(tree)
-    ast.fix_missing_locations(new_tree)
-    return new_tree
-def extract_struct_locals_from_context(ctx: ASTTransformerContext):
-    """
-    - Uses ctx.func.func to get the function signature.
-    - Searches this for any dataclasses:
-      - If it finds any dataclasses, then converts them into expanded names.
-      - E.g. my_struct: MyStruct, and MyStruct contains a, b, c would become:
-          {"__ti_my_struct_a", "__ti_my_struct_b, "__ti_my_struct_c"}
-    """
-    assert ctx.func is not None
-    sig = inspect.signature(ctx.func.func)
-    parameters = sig.parameters
-    struct_locals = set()
-    for param_name, parameter in parameters.items():
-        if dataclasses.is_dataclass(parameter.annotation):
-            for field in dataclasses.fields(parameter.annotation):
-                child_name = f"__ti_{param_name}_{field.name}"
-                struct_locals.add(child_name)
-    return struct_locals
 class Func:
     function_counter = 0
@@ -396,19 +372,19 @@ class Func:
         self.classfunc = _classfunc
         self.pyfunc = _pyfunc
         self.is_real_function = is_real_function
-        self.arguments: list[KernelArgument] = []
-        self.orig_arguments: list[KernelArgument] = []
+        self.arg_metas: list[ArgMetadata] = []
+        self.orig_arguments: list[ArgMetadata] = []
         self.return_type: tuple[Type, ...] | None = None
         self.extract_arguments()
         self.template_slot_locations: list[int] = []
-        for i, arg in enumerate(self.arguments):
+        for i, arg in enumerate(self.arg_metas):
             if arg.annotation == template or isinstance(arg.annotation, template):
                 self.template_slot_locations.append(i)
-        self.mapper = GsTaichiCallableTemplateMapper(self.arguments, self.template_slot_locations)
+        self.mapper = TemplateMapper(self.arg_metas, self.template_slot_locations)
         self.gstaichi_functions = {}  # The |Function| class in C++
         self.has_print = False
-    def __call__(self, *args, **kwargs) -> Any:
+    def __call__(self: "Func", *args, **kwargs) -> Any:
         args = _process_args(self, is_func=True, args=args, kwargs=kwargs)
         if not impl.inside_kernel():
@@ -433,8 +409,9 @@ class Func:
             is_real_function=self.is_real_function,
         )
-        struct_locals = extract_struct_locals_from_context(ctx)
-        tree = unpack_ndarray_struct(tree, struct_locals=struct_locals)
+        struct_locals = _kernel_impl_dataclass.extract_struct_locals_from_context(ctx)
+        tree = _kernel_impl_dataclass.unpack_ast_struct_expressions(tree, struct_locals=struct_locals)
         ret = transform_tree(tree, ctx)
         if not self.is_real_function:
             if self.return_type and ctx.returned != ReturnStatus.ReturnedValue:
@@ -446,7 +423,7 @@ class Func:
         assert self.is_real_function
         non_template_args = []
         dbg_info = _ti_core.DebugInfo(impl.get_runtime().get_current_src_info())
-        for i, kernel_arg in enumerate(self.arguments):
+        for i, kernel_arg in enumerate(self.arg_metas):
             anno = kernel_arg.annotation
             if not isinstance(anno, template):
                 if id(anno) in primitive_types.type_ids:
@@ -497,10 +474,10 @@ class Func:
         def func_body():
             old_callable = impl.get_runtime().compiling_callable
-            impl.get_runtime().compiling_callable = fn
+            impl.get_runtime()._compiling_callable = fn
             ctx.ast_builder = fn.ast_builder()
             transform_tree(tree, ctx)
-            impl.get_runtime().compiling_callable = old_callable
+            impl.get_runtime()._compiling_callable = old_callable
         self.gstaichi_functions[key.instance_id] = fn
         self.compiled[key.instance_id] = func_body
@@ -569,8 +546,8 @@ class Func:
                     raise GsTaichiSyntaxError(
                         f"Invalid type annotation (argument {i}) of GsTaichi function: {annotation}"
                     )
-            self.arguments.append(KernelArgument(annotation, param.name, param.default))
-            self.orig_arguments.append(KernelArgument(annotation, param.name, param.default))
+            self.arg_metas.append(ArgMetadata(annotation, param.name, param.default))
+            self.orig_arguments.append(ArgMetadata(annotation, param.name, param.default))
 def _get_global_vars(_func: Callable) -> dict[str, Any]:
@@ -587,6 +564,14 @@ def _get_global_vars(_func: Callable) -> dict[str, Any]:
     return global_vars
+@dataclasses.dataclass
+class SrcLlCacheObservations:
+    cache_key_generated: bool = False
+    cache_validated: bool = False
+    cache_loaded: bool = False
+    cache_stored: bool = False
 class Kernel:
     counter = 0
@@ -601,21 +586,29 @@ class Kernel:
             AutodiffMode.REVERSE,
         )
         self.autodiff_mode = autodiff_mode
-        self.grad: Kernel | None = None
-        self.arguments: list[KernelArgument] = []
+        self.grad: "Kernel | None" = None
+        self.arg_metas: list[ArgMetadata] = []
         self.return_type = None
         self.classkernel = _classkernel
         self.extract_arguments()
         self.template_slot_locations = []
-        for i, arg in enumerate(self.arguments):
+        for i, arg in enumerate(self.arg_metas):
             if arg.annotation == template or isinstance(arg.annotation, template):
                 self.template_slot_locations.append(i)
-        self.mapper = GsTaichiCallableTemplateMapper(self.arguments, self.template_slot_locations)
+        self.mapper = TemplateMapper(self.arg_metas, self.template_slot_locations)
         impl.get_runtime().kernels.append(self)
         self.reset()
         self.kernel_cpp = None
-        self.compiled_kernels: dict[CompiledKernelKeyType, KernelCxx] = {}
+        # A materialized kernel is a KernelCxx object which may or may not have
+        # been compiled. It generally has been converted at least as far as AST
+        # and front-end IR, but not necessarily any further.
+        self.materialized_kernels: dict[CompiledKernelKeyType, KernelCxx] = {}
         self.has_print = False
+        self.gstaichi_callable: GsTaichiCallable | None = None
+        self.visited_functions: set[FunctionSourceInfo] = set()
+        self.kernel_function_info: FunctionSourceInfo | None = None
+        self.src_ll_cache_observations: SrcLlCacheObservations = SrcLlCacheObservations()
     def ast_builder(self) -> ASTBuilder:
         assert self.kernel_cpp is not None
@@ -623,7 +616,7 @@ class Kernel:
     def reset(self) -> None:
         self.runtime = impl.get_runtime()
-        self.compiled_kernels = {}
+        self.materialized_kernels = {}
     def extract_arguments(self) -> None:
         sig = inspect.signature(self.func)
@@ -639,7 +632,7 @@ class Kernel:
             for return_type in self.return_type:
                 if return_type is Ellipsis:
                     raise GsTaichiSyntaxError("Ellipsis is not supported in return type annotations")
-        params = sig.parameters
+        params = dict(sig.parameters)
         arg_names = params.keys()
         for i, arg_name in enumerate(arg_names):
             param = params[arg_name]
@@ -682,34 +675,50 @@ class Kernel:
                     pass
                 elif isinstance(annotation, StructType):
                     pass
-                elif isinstance(annotation, ArgPackType):
-                    pass
                 elif annotation == template:
                     pass
                 elif isinstance(annotation, type) and dataclasses.is_dataclass(annotation):
                     pass
                 else:
-                    raise GsTaichiSyntaxError(
-                        f"Invalid type annotation (argument {i}) of GsTaichi kernel: {annotation}"
-                    )
-            self.arguments.append(KernelArgument(annotation, param.name, param.default))
+                    raise GsTaichiSyntaxError(f"Invalid type annotation (argument {i}) of Taichi kernel: {annotation}")
+            self.arg_metas.append(ArgMetadata(annotation, param.name, param.default))
-    def materialize(self, key: CompiledKernelKeyType | None, args: tuple[Any, ...], arg_features):
+    def materialize(self, key: CompiledKernelKeyType | None, args: tuple[Any, ...], arg_features=None):
         if key is None:
             key = (self.func, 0, self.autodiff_mode)
         self.runtime.materialize()
+        self.compiled_kernel_data = None
+        self.fast_checksum = None
-        if key in self.compiled_kernels:
+        if key in self.materialized_kernels:
             return
+        if self.gstaichi_callable and self.gstaichi_callable.is_pure:
+            kernel_source_info, _src = get_source_info_and_src(self.func)
+            self.fast_checksum = src_hasher.create_cache_key(kernel_source_info, args)
+            if self.fast_checksum:
+                self.src_ll_cache_observations.cache_key_generated = True
+            if self.fast_checksum and src_hasher.validate_cache_key(self.fast_checksum):
+                self.src_ll_cache_observations.cache_validated = True
+                prog = impl.get_runtime().prog
+                self.compiled_kernel_data = prog.load_fast_cache(
+                    self.fast_checksum,
+                    self.func.__name__,
+                    prog.config(),
+                    prog.get_device_caps(),
+                )
+                if self.compiled_kernel_data:
+                    self.src_ll_cache_observations.cache_loaded = True
         kernel_name = f"{self.func.__name__}_c{self.kernel_counter}_{key[1]}"
-        _logging.trace(f"Compiling kernel {kernel_name} in {self.autodiff_mode}...")
+        _logging.trace(f"Materializing kernel {kernel_name} in {self.autodiff_mode}...")
         tree, ctx = _get_tree_and_ctx(
             self,
             args=args,
             excluded_parameters=self.template_slot_locations,
             arg_features=arg_features,
+            current_kernel=self,
         )
         if self.autodiff_mode != AutodiffMode.NONE:
@@ -717,7 +726,7 @@ class Kernel:
         # Do not change the name of 'gstaichi_ast_generator'
         # The warning system needs this identifier to remove unnecessary messages
-        def gstaichi_ast_generator(kernel_cxx: Kernel):  # not sure if this type is correct, seems doubtful
+        def gstaichi_ast_generator(kernel_cxx: KernelCxx):
             nonlocal tree
             if self.runtime.inside_kernel:
                 raise GsTaichiSyntaxError(
@@ -729,8 +738,8 @@ class Kernel:
             self.kernel_cpp = kernel_cxx
             self.runtime.inside_kernel = True
             self.runtime._current_kernel = self
-            assert self.runtime.compiling_callable is None
-            self.runtime.compiling_callable = kernel_cxx
+            assert self.runtime._compiling_callable is None
+            self.runtime._compiling_callable = kernel_cxx
             try:
                 ctx.ast_builder = kernel_cxx.ast_builder()
@@ -767,8 +776,9 @@ class Kernel:
                     output_file.write_text(
                         json.dumps({"elapsed_txt": elapsed_txt, "elapsed_json": elapsed_json}, indent=2)
                     )
-                struct_locals = extract_struct_locals_from_context(ctx)
-                tree = unpack_ndarray_struct(tree, struct_locals=struct_locals)
+                struct_locals = _kernel_impl_dataclass.extract_struct_locals_from_context(ctx)
+                tree = _kernel_impl_dataclass.unpack_ast_struct_expressions(tree, struct_locals=struct_locals)
+                ctx.only_parse_function_def = self.compiled_kernel_data is not None
                 transform_tree(tree, ctx)
                 if not ctx.is_real_function:
                     if self.return_type and ctx.returned != ReturnStatus.ReturnedValue:
@@ -776,14 +786,14 @@ class Kernel:
             finally:
                 self.runtime.inside_kernel = False
                 self.runtime._current_kernel = None
-                self.runtime.compiling_callable = None
+                self.runtime._compiling_callable = None
         gstaichi_kernel = impl.get_runtime().prog.create_kernel(gstaichi_ast_generator, kernel_name, self.autodiff_mode)
-        assert key not in self.compiled_kernels
-        self.compiled_kernels[key] = gstaichi_kernel
+        assert key not in self.materialized_kernels
+        self.materialized_kernels[key] = gstaichi_kernel
     def launch_kernel(self, t_kernel: KernelCxx, *args) -> Any:
-        assert len(args) == len(self.arguments), f"{len(self.arguments)} arguments needed but {len(args)} provided"
+        assert len(args) == len(self.arg_metas), f"{len(self.arg_metas)} arguments needed but {len(args)} provided"
         tmps = []
         callbacks = []
@@ -897,43 +907,8 @@ class Kernel:
                     )
                 else:
                     raise GsTaichiRuntimeTypeError(
-                        f"Argument {needed} cannot be converted into required type {type(v)}"
+                        f"Argument of type {type(v)} cannot be converted into required type {needed}"
                     )
-            elif has_paddle():
-                # Do we want to continue to support paddle? :thinking_face:
-                # #maybeprunable
-                import paddle  # pylint: disable=C0415  # type: ignore
-                if isinstance(v, paddle.Tensor):
-                    # For now, paddle.fluid.core.Tensor._ptr() is only available on develop branch
-                    def get_call_back(u, v):
-                        def call_back():
-                            u.copy_(v, False)
-                        return call_back
-                    tmp = v.value().get_tensor()
-                    gstaichi_arch = self.runtime.prog.config().arch
-                    if v.place.is_gpu_place():
-                        if gstaichi_arch != _ti_core.Arch.cuda:
-                            # Paddle cuda tensor on GsTaichi non-cuda arch
-                            host_v = v.cpu()
-                            tmp = host_v.value().get_tensor()
-                            callbacks.append(get_call_back(v, host_v))
-                    elif v.place.is_cpu_place():
-                        if gstaichi_arch == _ti_core.Arch.cuda:
-                            # Paddle cpu tensor on GsTaichi cuda arch
-                            gpu_v = v.cuda()
-                            tmp = gpu_v.value().get_tensor()
-                            callbacks.append(get_call_back(v, gpu_v))
-                    else:
-                        # Paddle do support many other backends like XPU, NPU, MLU, IPU
-                        raise GsTaichiRuntimeTypeError(f"GsTaichi do not support backend {v.place} that Paddle support")
-                    launch_ctx.set_arg_external_array_with_shape(
-                        indices, int(tmp._ptr()), v.element_size() * v.size, array_shape, 0
-                    )
-                else:
-                    raise GsTaichiRuntimeTypeError(f"Argument {needed} cannot be converted into required type {v}")
             else:
                 raise GsTaichiRuntimeTypeError(f"Argument {needed} cannot be converted into required type {v}")
@@ -979,43 +954,28 @@ class Kernel:
             e.g. templates don't set kernel args, so returns 0
             a single ndarray is 1 kernel arg, so returns 1
             a struct of 3 ndarrays would set 3 kernel args, so return 3
+            note: len(indices) > 1 only happens with argpack (which we are removing support for)
             """
-            in_argpack = len(indices) > 1
             nonlocal actual_argument_slot, exceed_max_arg_num, set_later_list
             if actual_argument_slot >= max_arg_num:
                 exceed_max_arg_num = True
                 return 0
             actual_argument_slot += 1
-            if isinstance(needed_arg_type, ArgPackType):
-                if not isinstance(v, ArgPack):
-                    raise GsTaichiRuntimeTypeError.get(indices, str(needed_arg_type), str(provided_arg_type))
-                idx_new = 0
-                for j, (name, anno) in enumerate(needed_arg_type.members.items()):
-                    idx_new += recursive_set_args(anno, type(v[name]), v[name], indices + (idx_new,))
-                launch_ctx.set_arg_argpack(indices, v._ArgPack__argpack)  # type: ignore
-                return 1
             # Note: do not use sth like "needed == f32". That would be slow.
             if id(needed_arg_type) in primitive_types.real_type_ids:
                 if not isinstance(v, (float, int, np.floating, np.integer)):
                     raise GsTaichiRuntimeTypeError.get(indices, needed_arg_type.to_string(), provided_arg_type)
-                if in_argpack:
-                    return 1
                 launch_ctx.set_arg_float(indices, float(v))
                 return 1
             if id(needed_arg_type) in primitive_types.integer_type_ids:
                 if not isinstance(v, (int, np.integer)):
                     raise GsTaichiRuntimeTypeError.get(indices, needed_arg_type.to_string(), provided_arg_type)
-                if in_argpack:
-                    return 1
                 if is_signed(cook_dtype(needed_arg_type)):
                     launch_ctx.set_arg_int(indices, int(v))
                 else:
                     launch_ctx.set_arg_uint(indices, int(v))
                 return 1
             if isinstance(needed_arg_type, sparse_matrix_builder):
-                if in_argpack:
-                    set_later_list.append((set_arg_sparse_matrix_builder, (v,)))
-                    return 0
                 set_arg_sparse_matrix_builder(indices, v)
                 return 1
             if dataclasses.is_dataclass(needed_arg_type):
@@ -1027,39 +987,23 @@ class Kernel:
                     idx += recursive_set_args(field.type, field.type, field_value, (indices[0] + idx,))
                 return idx
             if isinstance(needed_arg_type, ndarray_type.NdarrayType) and isinstance(v, gstaichi.lang._ndarray.Ndarray):
-                if in_argpack:
-                    set_later_list.append((set_arg_ndarray, (v,)))
-                    return 0
                 set_arg_ndarray(indices, v)
                 return 1
             if isinstance(needed_arg_type, texture_type.TextureType) and isinstance(v, gstaichi.lang._texture.Texture):
-                if in_argpack:
-                    set_later_list.append((set_arg_texture, (v,)))
-                    return 0
                 set_arg_texture(indices, v)
                 return 1
             if isinstance(needed_arg_type, texture_type.RWTextureType) and isinstance(
                 v, gstaichi.lang._texture.Texture
             ):
-                if in_argpack:
-                    set_later_list.append((set_arg_rw_texture, (v,)))
-                    return 0
                 set_arg_rw_texture(indices, v)
                 return 1
             if isinstance(needed_arg_type, ndarray_type.NdarrayType):
-                if in_argpack:
-                    set_later_list.append((set_arg_ext_array, (v, needed_arg_type)))
-                    return 0
                 set_arg_ext_array(indices, v, needed_arg_type)
                 return 1
             if isinstance(needed_arg_type, MatrixType):
-                if in_argpack:
-                    return 1
                 set_arg_matrix(indices, v, needed_arg_type)
                 return 1
             if isinstance(needed_arg_type, StructType):
-                if in_argpack:
-                    return 1
                 # Unclear how to make the following pass typing checks
                 # StructType implements __instancecheck__, which should be a classmethod, but
                 # is currently an instance method
@@ -1077,7 +1021,7 @@ class Kernel:
         template_num = 0
         i_out = 0
         for i_in, val in enumerate(args):
-            needed_ = self.arguments[i_in].annotation
+            needed_ = self.arg_metas[i_in].annotation
             if needed_ == template or isinstance(needed_, template):
                 template_num += 1
                 i_out += 1
@@ -1094,10 +1038,19 @@ class Kernel:
         try:
             prog = impl.get_runtime().prog
-            # Compile kernel (& Online Cache & Offline Cache)
-            compiled_kernel_data = prog.compile_kernel(prog.config(), prog.get_device_caps(), t_kernel)
-            # Launch kernel
-            prog.launch_kernel(compiled_kernel_data, launch_ctx)
+            if not self.compiled_kernel_data:
+                self.compiled_kernel_data = prog.compile_kernel(prog.config(), prog.get_device_caps(), t_kernel)
+                if self.fast_checksum:
+                    src_hasher.store(self.fast_checksum, self.visited_functions)
+                    prog.store_fast_cache(
+                        self.fast_checksum,
+                        self.kernel_cpp,
+                        prog.config(),
+                        prog.get_device_caps(),
+                        self.compiled_kernel_data,
+                    )
+                    self.src_ll_cache_observations.cache_stored = True
+            prog.launch_kernel(self.compiled_kernel_data, launch_ctx)
         except Exception as e:
             e = handle_exception_from_cpp(e)
             if impl.get_runtime().print_full_traceback:
@@ -1170,7 +1123,7 @@ class Kernel:
             _logging.warn("""opt_level = 1 is enforced to enable gradient computation.""")
             impl.current_cfg().opt_level = 1
         key = self.ensure_compiled(*args)
-        kernel_cpp = self.compiled_kernels[key]
+        kernel_cpp = self.materialized_kernels[key]
         return self.launch_kernel(kernel_cpp, *args)
@@ -1256,6 +1209,7 @@ def _kernel_impl(_func: Callable, level_of_class_stackframe: int, verbose: bool
     wrapped._is_classkernel = is_classkernel
     wrapped._primal = primal
     wrapped._adjoint = adjoint
+    primal.gstaichi_callable = wrapped
     return wrapped