dlinfer-ascend 0.2.3.post2__cp311-cp311-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dlinfer/__init__.py +5 -0
- dlinfer/framework/__init__.py +1 -0
- dlinfer/framework/lmdeploy_ext/__init__.py +6 -0
- dlinfer/framework/lmdeploy_ext/cudagraph/__init__.py +20 -0
- dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +391 -0
- dlinfer/framework/lmdeploy_ext/cudagraph/camb_cudagraph.py +133 -0
- dlinfer/framework/lmdeploy_ext/cudagraph/maca_cudagraph.py +128 -0
- dlinfer/framework/lmdeploy_ext/cudagraph/ppu_cudagraph.py +131 -0
- dlinfer/framework/lmdeploy_ext/device/__init__.py +79 -0
- dlinfer/framework/lmdeploy_ext/device/ascend.py +205 -0
- dlinfer/framework/lmdeploy_ext/device/camb.py +24 -0
- dlinfer/framework/lmdeploy_ext/quants/__init__.py +20 -0
- dlinfer/framework/lmdeploy_ext/quants/ascend_awq.py +248 -0
- dlinfer/framework/torch_npu_ext/__init__.py +12 -0
- dlinfer/framework/torch_npu_ext/aclgraph.py +59 -0
- dlinfer/framework/transformers_ext/__init__.py +17 -0
- dlinfer/framework/transformers_ext/cogvlm.py +25 -0
- dlinfer/framework/transformers_ext/internlm2.py +242 -0
- dlinfer/framework/transformers_ext/internvl.py +33 -0
- dlinfer/framework/transformers_ext/patch.py +33 -0
- dlinfer/graph/__init__.py +5 -0
- dlinfer/graph/custom_op.py +147 -0
- dlinfer/graph/dicp/__init__.py +0 -0
- dlinfer/graph/dicp/dynamo_bridge/__init__.py +0 -0
- dlinfer/graph/dicp/dynamo_bridge/compile.py +42 -0
- dlinfer/graph/dicp/dynamo_bridge/compile_fx.py +305 -0
- dlinfer/graph/dicp/dynamo_bridge/conversion.py +75 -0
- dlinfer/graph/dicp/dynamo_bridge/decompositions.py +38 -0
- dlinfer/graph/dicp/dynamo_bridge/graph.py +141 -0
- dlinfer/graph/dicp/dynamo_bridge/op_transformer.py +293 -0
- dlinfer/graph/dicp/dynamo_bridge/operator.py +87 -0
- dlinfer/graph/dicp/dynamo_bridge/pt_patch.py +320 -0
- dlinfer/graph/dicp/dynamo_bridge/torch_version.py +38 -0
- dlinfer/graph/dicp/dynamo_bridge/utils.py +158 -0
- dlinfer/graph/dicp/vendor/AtbGraph/__init__.py +13 -0
- dlinfer/graph/dicp/vendor/AtbGraph/atb_op.py +853 -0
- dlinfer/graph/dicp/vendor/AtbGraph/codegen/__init__.py +0 -0
- dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb.py +318 -0
- dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb_graph.py +768 -0
- dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb_infer_param.py +763 -0
- dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb_op.py +1279 -0
- dlinfer/graph/dicp/vendor/AtbGraph/codegen/libdicp_model.so +0 -0
- dlinfer/graph/dicp/vendor/AtbGraph/codegen/load_and_run.py +21 -0
- dlinfer/graph/dicp/vendor/AtbGraph/codegen/utils.py +178 -0
- dlinfer/graph/dicp/vendor/AtbGraph/compile_job.py +52 -0
- dlinfer/graph/dicp/vendor/AtbGraph/config.py +36 -0
- dlinfer/graph/dicp/vendor/AtbGraph/conversion.py +908 -0
- dlinfer/graph/dicp/vendor/AtbGraph/ext_ops.py +95 -0
- dlinfer/graph/dicp/vendor/AtbGraph/infer_res_utils.py +200 -0
- dlinfer/graph/dicp/vendor/AtbGraph/opset_convert.py +70 -0
- dlinfer/graph/dicp/vendor/AtbGraph/pattern_replacement.py +152 -0
- dlinfer/graph/dicp/vendor/__init__.py +0 -0
- dlinfer/ops/__init__.py +2 -0
- dlinfer/ops/llm.py +879 -0
- dlinfer/utils/__init__.py +1 -0
- dlinfer/utils/config.py +18 -0
- dlinfer/utils/registry.py +8 -0
- dlinfer/utils/type_annotation.py +3 -0
- dlinfer/vendor/__init__.py +33 -0
- dlinfer/vendor/ascend/__init__.py +5 -0
- dlinfer/vendor/ascend/pytorch_patch.py +55 -0
- dlinfer/vendor/ascend/torch_npu_ops.py +601 -0
- dlinfer/vendor/ascend/utils.py +20 -0
- dlinfer/vendor/vendor.yaml +2 -0
- dlinfer_ascend-0.2.3.post2.dist-info/LICENSE +28 -0
- dlinfer_ascend-0.2.3.post2.dist-info/METADATA +213 -0
- dlinfer_ascend-0.2.3.post2.dist-info/RECORD +70 -0
- dlinfer_ascend-0.2.3.post2.dist-info/WHEEL +5 -0
- dlinfer_ascend-0.2.3.post2.dist-info/entry_points.txt +2 -0
- dlinfer_ascend-0.2.3.post2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Copyright (c) 2024, DeepLink. All rights reserved.
|
|
2
|
+
import inspect
|
|
3
|
+
from functools import wraps
|
|
4
|
+
|
|
5
|
+
from torch.library import Library, impl
|
|
6
|
+
|
|
7
|
+
import dlinfer.graph
|
|
8
|
+
from dlinfer.utils.type_annotation import Callable, Optional, Sequence, Dict
|
|
9
|
+
from dlinfer.vendor import dispatch_key, vendor_name
|
|
10
|
+
|
|
11
|
+
# Per-namespace torch.library handles, reused across registrations so each
# namespace gets exactly one IMPL Library object.
library_impl_dict: Dict[str, Library] = dict()
# Vendors for which graph-mode custom-op registration is performed; all
# other vendors only get the eager default-value wrapper.
graph_enabled_backends = ["ascend"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def register_custom_op(
    qualname: str,
    shape_param_keys: Optional[Sequence[str]] = None,
    default_value: Optional[Dict] = None,
    impl_abstract_func: Optional[Callable] = None,
) -> Callable:
    """Decorator factory registering a function as a torch custom op.

    Args:
        qualname: ``"namespace::op_name"`` used for torch registration.
        shape_param_keys: parameter names whose tensors determine the output
            shapes; used to synthesize a fake/abstract impl when
            ``impl_abstract_func`` is not given.
        default_value: optional mapping of parameter name -> default value
            applied to both the eager function and the torch.ops wrapper.
        impl_abstract_func: explicit abstract (meta/fake) implementation.

    Returns:
        A decorator. On graph-enabled vendors it registers the op with torch
        and returns a wrapper that routes between the eager function and
        ``torch.ops.<ns>.<op>`` based on
        ``dlinfer.graph.config.enable_graph_mode``; otherwise it only applies
        the default-value override.
    """
    # Vendors outside graph_enabled_backends skip torch registration entirely.
    disable = vendor_name not in graph_enabled_backends

    def inner_func(func: Callable):
        if disable:
            return override_default_value_static(default_value)(func)
        # NOTE: this import also binds the local name `torch`, which the
        # rest of this closure relies on (the module does not import torch
        # at top level).
        import torch._custom_ops

        nonlocal impl_abstract_func
        lib_name, func_name = qualname.split("::")
        torch._custom_ops.custom_op(qualname)(func)
        # using low level torch.library APIs in case of the registration
        # of fallback kernels which raises error in torch._custom_ops.impl
        if lib_name not in library_impl_dict:
            library_impl_dict[lib_name] = Library(lib_name, "IMPL")
        impl(library_impl_dict[lib_name], func_name, dispatch_key)(func)
        if impl_abstract_func is None:
            # Build a generic abstract impl: each output mirrors the shape
            # and dtype of the argument named in shape_param_keys.
            assert shape_param_keys is not None
            params_name_list = [name for name in inspect.signature(func).parameters]

            def _impl_abstract_func(*args, **kwargs):
                assert len(args) + len(kwargs) == len(params_name_list)
                result = []
                for key in shape_param_keys:
                    key_index = params_name_list.index(key)
                    # The key may arrive positionally or by keyword.
                    if key_index < len(args):
                        target = args[key_index]
                    else:
                        target = kwargs[key]
                    result.append(torch.empty_like(target))
                if len(result) == 1:
                    return result[0]
                return tuple(result)

            impl_abstract_func = _impl_abstract_func
        torch._custom_ops.impl_abstract(qualname)(impl_abstract_func)
        torch_ops_namespace = getattr(torch.ops, lib_name)
        torch_ops_func = getattr(torch_ops_namespace, func_name)
        assert torch_ops_func is not None
        # override default value
        func_with_default = override_default_value_static(default_value)(func)
        torch_ops_func_with_default = override_default_value_dynamic(
            default_value, func
        )(torch_ops_func)

        # use config.enable_graph_mode to control func call
        @wraps(func)
        def patched_func(*args, **kwargs):
            if not dlinfer.graph.config.enable_graph_mode:
                return func_with_default(*args, **kwargs)
            else:
                return torch_ops_func_with_default(*args, **kwargs)

        return patched_func

    return inner_func
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def override_default_value_dynamic(
    default_value: Optional[Dict], origin_func: Callable
):
    """Wrap *func* so it exposes *origin_func*'s signature with overridden defaults.

    Used for callables whose own signature is opaque (e.g. ``torch.ops``
    entries, which accept ``*args, **kwargs``): a shim with an explicit
    parameter list is generated via ``exec`` so defaults from
    ``default_value`` apply.

    Args:
        default_value: mapping of parameter name -> default to inject;
            ``None`` disables wrapping.
        origin_func: the eager function whose signature is replicated.

    Returns:
        A decorator that returns ``func`` unchanged when ``default_value``
        is ``None``, otherwise a generated shim forwarding to ``func``.
    """

    def inner_func(func):
        if default_value is None:
            return func
        sig = inspect.signature(origin_func)
        sig_param_keys = sig.parameters.keys()
        params_str = ", ".join(sig_param_keys)
        params_with_default = []
        for name in sig_param_keys:
            if name in default_value:
                # repr() produces a valid source literal for strings and
                # other simple values alike, and escapes embedded quotes
                # (the previous f"{name}='{value}'" form generated broken
                # code for string defaults containing a quote character).
                params_with_default.append(f"{name}={default_value[name]!r}")
            else:
                params_with_default.append(name)
        params_str_with_default = ", ".join(params_with_default)
        func_code = f"""
def {func.__name__}({params_str_with_default}):
    return original_func({params_str})
"""
        exec_namespace = {}
        # it's hard not to use exec here; inputs are trusted (op names and
        # defaults supplied by this package, not by end users).
        exec(func_code, {"original_func": func}, exec_namespace)
        dynamic_func = exec_namespace[func.__name__]

        return dynamic_func

    return inner_func
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def override_default_value_static(default_value: Optional[Dict]):
    """Inject defaults from *default_value* directly into a function object.

    Rewrites the function's ``__signature__``, ``__defaults__`` and
    ``__kwdefaults__`` in place so the new defaults take effect at call
    time without any wrapper overhead.

    Args:
        default_value: mapping of parameter name -> default value;
            ``None`` disables the override.

    Returns:
        A decorator that returns the (mutated) function.

    Raises:
        SyntaxError: if the override would leave a *positional* parameter
            without a default after one that has a default.
    """

    # suitable for the function which signature isn't (*args, **kwargs)
    def inner_func(func):
        if default_value is None:
            return func
        sig = inspect.signature(func)
        old_params = sig.parameters
        new_params = []
        default_arg = []
        default_kwarg = []
        # co_argcount counts positional parameters only; anything at or
        # beyond it is *args / keyword-only / **kwargs.
        func_co_argcount = func.__code__.co_argcount
        param_has_default_value = False
        for idx, (name, param) in enumerate(old_params.items()):
            if name in default_value:
                new_param = param.replace(default=default_value[name])
            else:
                new_param = param
            new_params.append(new_param)
            if new_param.default is not inspect.Parameter.empty:
                param_has_default_value = True
                if idx < func_co_argcount:
                    default_arg.append(new_param.default)
                else:
                    default_kwarg.append((name, new_param.default))
            elif param_has_default_value and new_param.kind in (
                inspect.Parameter.POSITIONAL_ONLY,
                inspect.Parameter.POSITIONAL_OR_KEYWORD,
            ):
                # Only *positional* non-default-after-default is illegal.
                # Keyword-only parameters (and *args) may legally follow
                # defaulted parameters, e.g. ``def f(a=1, *, b)``; the old
                # check rejected those valid signatures.
                raise SyntaxError(
                    f"non-default argument '{name}' follows default argument"
                )
        new_signature = sig.replace(parameters=new_params)
        func.__signature__ = new_signature
        func.__defaults__ = tuple(default_arg)
        func.__kwdefaults__ = dict(default_kwarg)
        return func

    return inner_func
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from abc import ABCMeta, abstractmethod

from dlinfer.graph.dicp.dynamo_bridge.torch_version import is_torch_251_or_higher

# torch 2.5.1 moved AsyncCompile out of _inductor.codecache into its own
# module; import from whichever location this torch provides.
if is_torch_251_or_higher:
    from torch._inductor.async_compile import AsyncCompile
else:
    from torch._inductor.codecache import AsyncCompile
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DeviceCompileJob(metaclass=ABCMeta):
    """Abstract interface for a device-specific kernel compile job.

    Subclasses supply a cache key and a way to produce the compiled
    artifact; ``DeviceKernelCache.get_kernel`` consumes both.

    NOTE(fix): the original wrote ``__metaclass__ = ABCMeta``, which is
    Python 2 syntax and a silent no-op in Python 3 — the abstract methods
    were never enforced. ``metaclass=ABCMeta`` restores the intended
    contract.
    """

    def __init__(self):
        pass

    @abstractmethod
    def get_key(self):
        """Return a hashable key uniquely identifying this compile job."""

    @abstractmethod
    def get_compile_result(self):
        """Compile (or load) and return the kernel artifact."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DeviceKernelCache:
    """Process-wide cache mapping compile-job keys to loaded kernels."""

    # Shared by all callers; keyed by each job's get_key() value.
    cache = dict()
    clear = staticmethod(cache.clear)

    @classmethod
    def get_kernel(cls, device_compile_job):
        """Return the kernel for *device_compile_job*, compiling on first use.

        The loaded artifact is tagged with its cache key and memoized, so
        repeated requests for the same key return the same object.
        """
        key = device_compile_job.get_key()
        if key not in cls.cache:
            kernel = device_compile_job.get_compile_result()
            kernel.key = key
            cls.cache[key] = kernel
        return cls.cache[key]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class AsyncCompileKernel(AsyncCompile):
    """AsyncCompile extended with a device-kernel compile entry point."""

    def compile_kernel(self, device_compile_job):
        """Compile (or fetch the cached) kernel and return its ``run`` callable."""
        kernel = DeviceKernelCache.get_kernel(device_compile_job)
        return kernel.run
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
from torch._dynamo.backends.common import aot_autograd
|
|
2
|
+
from torch._functorch.aot_autograd import make_boxed_func
|
|
3
|
+
from .graph import GraphTransformer
|
|
4
|
+
import functools
|
|
5
|
+
import itertools
|
|
6
|
+
import logging
|
|
7
|
+
import sys
|
|
8
|
+
import functorch
|
|
9
|
+
import torch.fx
|
|
10
|
+
import importlib
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
from typing import List
|
|
14
|
+
from importlib import import_module
|
|
15
|
+
|
|
16
|
+
import torch
|
|
17
|
+
|
|
18
|
+
from dlinfer.graph.dicp.dynamo_bridge import pt_patch # noqa F401
|
|
19
|
+
from dlinfer.graph.dicp.dynamo_bridge.torch_version import (
|
|
20
|
+
is_torch_200,
|
|
21
|
+
is_torch_210_or_higher,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
log = logging.getLogger(__name__)

# Resolved via import_module to reach torch's private dynamo modules.
dynamo_logging = import_module("torch._dynamo.logging")
dynamo_utils = import_module("torch._dynamo.utils")

# Local alias of torch._dynamo.utils.count_calls.
count_calls = dynamo_utils.count_calls
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_fake_mode_from_tensors(input_tensors):
    """Find the fake-tensor mode for *input_tensors*, per torch version.

    torch 2.0 exposed ``fake_mode_from_tensors``; 2.1+ renamed it to
    ``detect_fake_mode``. Unsupported versions raise ValueError.
    """
    if is_torch_200:
        from torch._dynamo.utils import fake_mode_from_tensors

        return fake_mode_from_tensors(input_tensors)
    if is_torch_210_or_higher:
        from torch._dynamo.utils import detect_fake_mode

        return detect_fake_mode(input_tensors)
    raise ValueError(f"unsupported dicp torch version: {torch.__version__}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def used_nodes_all_symint(nodes):
    """Return True when every *used* placeholder (and annotated output) is a SymInt.

    Used by compile_fx_inner to fall back to eager mode for graphs that
    only shuffle symbolic integers.

    Args:
        nodes: the fx graph's nodes (e.g. ``list(gm.graph.nodes)``).

    Returns:
        False as soon as a used placeholder or an annotated output is not a
        ``torch.SymInt`` (or its SymInt-ness cannot be established);
        True otherwise.
    """
    for node in nodes:
        if node.op == "placeholder" and len(node.users) > 0:
            if hasattr(node, "meta"):
                # "val" is only populated after fake-tensor propagation.
                # Guard the lookup (the output branch below already did);
                # the old unguarded node.meta["val"] raised KeyError on
                # un-propagated graphs. Without "val" we cannot prove the
                # input is a SymInt, so answer False.
                if "val" not in node.meta:
                    return False
                val = node.meta["val"]
                if not isinstance(val, torch.SymInt):
                    return False
        elif node.op == "output":
            if hasattr(node, "meta") and "val" in node.meta:
                val = node.meta["val"]
                if not isinstance(val, torch.SymInt):
                    return False
    return True
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@torch.utils._python_dispatch._disable_current_modes()
def compile_fx_inner(
    gm: torch.fx.GraphModule,
    example_inputs: List[torch.Tensor],
    num_fixed=0,
    is_backward=False,
    graph_id=None,
    backend=None,
):
    """Compile one fx GraphModule with the dicp backend.

    Trivial graphs (no calls) and all-SymInt graphs fall back to eager;
    everything else is lowered through GraphTransformer.
    """
    # Nothing to compile: box the eager forward for aot_autograd.
    if count_calls(gm.graph) == 0:
        return make_boxed_func(gm.forward)

    # all symint inputs fallback to eager mode
    if used_nodes_all_symint(list(gm.graph.nodes)):
        return gm

    # lift the maximum depth of the Python interpreter stack
    # to adapt large/deep models
    sys.setrecursionlimit(max(sys.getrecursionlimit(), 2000))

    transformer = GraphTransformer(gm, backend)
    transformer.transform()
    compiled_fn = transformer.compile_to_fn()

    # aot autograd needs to know to pass in inputs as a list
    compiled_fn._boxed_call = True
    return compiled_fn
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Monotonic id handed to each graph compiled in this process.
_graph_counter = itertools.count(0)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def compile_fx(
    model_: torch.fx.GraphModule,
    example_inputs_: List[torch.Tensor],
    backend: str,
    inner_compile=compile_fx_inner,
):
    """Version-dispatching entry point: route to the torch-2.0 or 2.1+ path."""
    if is_torch_200:
        entry = compile_fx_200
    elif is_torch_210_or_higher:
        entry = compile_fx_210
    else:
        raise ValueError(f"unsupported dicp torch version: {torch.__version__}")
    return entry(model_, example_inputs_, backend, inner_compile)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def compile_fx_200(
    model_: torch.fx.GraphModule,
    example_inputs_: List[torch.Tensor],
    backend: str,
    inner_compile=compile_fx_inner,
):
    """Main entrypoint to a compile given FX graph (torch 2.0 path).

    Wires dicp's inner_compile into aot_autograd as both the forward and
    backward compiler, using the vendor decomposition table for *backend*.
    """
    functorch.compile.config.use_functionalize = True
    functorch.compile.config.use_fake_tensor = True

    num_example_inputs = len(example_inputs_)

    graph_id = next(_graph_counter)

    @dynamo_utils.dynamo_timed
    def fw_compiler(model: torch.fx.GraphModule, example_inputs):
        # Inputs beyond the user-supplied ones (e.g. lifted parameters)
        # are treated as fixed.
        fixed = len(example_inputs) - num_example_inputs
        return inner_compile(
            model,
            example_inputs,
            num_fixed=fixed,
            graph_id=graph_id,
            backend=backend,
        )

    @dynamo_utils.dynamo_timed
    def bw_compiler(model: torch.fx.GraphModule, example_inputs):
        # For backward graphs the static (non-tangent) prefix is fixed.
        fixed = count_tangents(model)
        return inner_compile(
            model,
            example_inputs,
            num_fixed=fixed,
            is_backward=True,
            graph_id=graph_id,
            backend=backend,
        )

    decompositions = get_decompositions(backend=backend)
    return aot_autograd(
        fw_compiler=fw_compiler, bw_compiler=bw_compiler, decompositions=decompositions
    )(model_, example_inputs_)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def compile_fx_210(
    model_: torch.fx.GraphModule,
    example_inputs_: List[torch.Tensor],
    backend: str,
    inner_compile=compile_fx_inner,
):
    """Main entrypoint to compile a given FX graph (torch 2.1+ path).

    Normalizes the graph (tuple returns, dynamo-export graphs, flattened
    inputs) by recursing through compile_fx, then hands forward/backward/
    inference compilation to aot_autograd backed by dicp's inner_compile.
    """
    import torch._dynamo.config as dynamo_config
    from torch._inductor.compile_fx import (
        flatten_graph_inputs,
        graph_returns_tuple,
        make_graph_return_tuple,
        pre_grad_passes,
        joint_graph_passes,
        min_cut_rematerialization_partition,
        _PyTreeCodeGen,
        handle_dynamo_export_graph,
    )

    decompositions = get_decompositions(backend=backend)

    # NOTE(fix): the recursive wrapper is invoked by the torch helpers below
    # as ``fn(gm, example_inputs)``, so ``backend`` must be pre-bound here.
    # The previous version bound a ``decompositions=`` kwarg instead, which
    # compile_fx does not accept, and left ``backend`` unbound — every
    # recursive path raised TypeError. (compile_fx resolves decompositions
    # itself from ``backend``.)
    recursive_compile_fx = functools.partial(
        compile_fx,
        backend=backend,
        inner_compile=inner_compile,
    )

    if not graph_returns_tuple(model_):
        return make_graph_return_tuple(
            model_,
            example_inputs_,
            recursive_compile_fx,
        )

    if isinstance(model_, torch.fx.GraphModule):
        if isinstance(model_.graph._codegen, _PyTreeCodeGen):
            # this graph is the result of dynamo.export()
            return handle_dynamo_export_graph(
                model_,
                example_inputs_,
                recursive_compile_fx,
            )

        # Since handle_dynamo_export_graph will trigger compile_fx again,
        # Move these passes after handle_dynamo_export_graph to avoid repeated calls.
        model_ = pre_grad_passes(model_, example_inputs_)

    if any(isinstance(x, (list, tuple, dict)) for x in example_inputs_):
        return flatten_graph_inputs(
            model_,
            example_inputs_,
            recursive_compile_fx,
        )

    # assert not config._raise_error_for_testing
    num_example_inputs = len(example_inputs_)

    graph_id = next(_graph_counter)

    def fw_compiler_base(
        model: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
        is_inference: bool,
    ):
        return _fw_compiler_base(model, example_inputs, is_inference)

    def _fw_compiler_base(
        model: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
        is_inference: bool,
    ):
        if is_inference:
            # partition_fn won't be called
            # joint_graph_passes(model)
            pass

        # Inputs beyond the user-supplied ones (lifted params) are fixed.
        fixed = len(example_inputs) - num_example_inputs
        return inner_compile(
            model,
            example_inputs,
            num_fixed=fixed,
            graph_id=graph_id,
            backend=backend,
        )

    fw_compiler = functools.partial(fw_compiler_base, is_inference=False)
    inference_compiler = functools.partial(fw_compiler_base, is_inference=True)

    def partition_fn(graph, joint_inputs, **kwargs):
        joint_graph_passes(graph)
        return min_cut_rematerialization_partition(
            graph, joint_inputs, **kwargs, compiler="inductor"
        )

    # Save and restore dynamic shapes setting for backwards, as it is
    # sometimes done as a context manager which won't be set when we
    # hit backwards compile
    dynamic_shapes = dynamo_config.dynamic_shapes

    def bw_compiler(model: torch.fx.GraphModule, example_inputs):
        with dynamo_config.patch(dynamic_shapes=dynamic_shapes):
            fixed = count_tangents(model)
            return inner_compile(
                model,
                example_inputs,
                num_fixed=fixed,
                is_backward=True,
                graph_id=graph_id,
                backend=backend,
            )

    # TODO: can add logging before/after the call to create_aot_dispatcher_function
    # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
    # once torchdynamo is merged into pytorch
    return aot_autograd(
        fw_compiler=fw_compiler,
        bw_compiler=bw_compiler,
        inference_compiler=inference_compiler,
        decompositions=decompositions,
        partition_fn=partition_fn,
        keep_inference_input_mutations=True,
    )(model_, example_inputs_)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def count_tangents(fx_g: torch.fx.GraphModule):
    """
    Infers which inputs are static for a backwards graph
    """
    # AOT backward graphs name gradient inputs "tangents_*"; every other
    # placeholder (saved activations, params) is considered static.
    placeholders = [n for n in fx_g.graph.nodes if n.op == "placeholder"]
    static_arg_idxs = [
        idx for idx, n in enumerate(placeholders) if "tangents" not in n.name
    ]
    # Static inputs must form a contiguous prefix of the placeholder list.
    assert static_arg_idxs == list(range(len(static_arg_idxs)))
    return len(static_arg_idxs)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def get_decompositions(backend):
    """Load the decomposition table shipped by the vendor matching *backend*.

    Scans the sibling ``vendor`` package for a folder whose name matches
    *backend* case-insensitively and returns its ``config.decomp`` table.
    Asserts if no vendor folder matches.
    """
    decomp_table = {}
    vendor_root = os.path.dirname(os.path.dirname(__file__)) + "/vendor"
    matched = False
    for entry in os.listdir(vendor_root):
        if entry.lower() == backend.lower():
            vendor_config = importlib.import_module(
                "dlinfer.graph.dicp.vendor." + entry + ".config"
            )
            decomp_table = vendor_config.decomp
            matched = True
    assert matched, "Not found decomp table!"
    return decomp_table
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import torch
|
|
3
|
+
from dlinfer.graph.dicp.dynamo_bridge.operator import Operator
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def args_kwargs_unchange(args, kwargs):
    """Identity pre-processing hook: return the (args, kwargs) pair untouched."""
    return args, kwargs
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def register_conversion_impl(
    conversions: list, aten_fn, decomp_fn, process_args_kwargs_fn=None
):
    """Register *decomp_fn* as the conversion for one or more aten ops.

    Args:
        conversions: the mapping (op -> conversion) to update in place.
        aten_fn: an op, a list/tuple of ops, or "torch.ops.<ns>.<op>[.<overload>]"
            strings; string entries that cannot be resolved are skipped with
            a diagnostic print instead of failing.
        decomp_fn: either an Operator subclass (registered via its
            singleton) or a plain conversion function.
        process_args_kwargs_fn: optional (args, kwargs) pre-processor used
            only for Operator singletons; defaults to the identity hook.

    Returns:
        The Operator singleton for class registrations, otherwise the
        wrapped conversion function.
    """
    # Operator subclasses are registered as (singleton, args/kwargs hook)
    # pairs; plain functions get a thin wraps() wrapper.
    register_op_singleton_flag = isinstance(decomp_fn, type) and issubclass(
        decomp_fn, Operator
    )
    if register_op_singleton_flag:
        wrapped = (
            decomp_fn.get_singleton(),
            (
                args_kwargs_unchange
                if process_args_kwargs_fn is None
                else process_args_kwargs_fn
            ),
        )
    else:

        @functools.wraps(decomp_fn)
        def wrapped(*args, **kwargs):
            return decomp_fn(*args, **kwargs)

    if not isinstance(aten_fn, (list, tuple)):
        aten_fn = [aten_fn]
    else:
        aten_fn = list(aten_fn)

    aten_fn_for_key = []
    for fn in list(aten_fn):
        if isinstance(fn, str):
            # Resolve "torch.ops.ns.op[.overload]" to the live op object,
            # skipping (with a printed notice) anything this torch build
            # does not provide.
            assert fn.startswith("torch.ops")
            real_fn_name = fn.replace("torch.ops.", "")
            ns, op_overload = real_fn_name.split(".", 1)
            if not hasattr(torch.ops, ns):
                print(
                    f"[dicp] can't find torch.ops.{ns}, conversion for {fn} is ignored"
                )
                continue
            ns_obj = getattr(torch.ops, ns)
            if "." in op_overload:
                # "op.overload" form: fetch the specific overload.
                op, overload = op_overload.split(".", 1)
                if not hasattr(ns_obj, op):
                    print(
                        f"[dicp] can't find torch.ops.{ns}.{op}, conversion for {fn} is ignored"
                    )
                    continue
                op_obj = getattr(ns_obj, op)
                fn = getattr(op_obj, overload)
            else:
                if not hasattr(ns_obj, op_overload):
                    print(
                        f"[dicp] can't find torch.ops.{ns}.{op_overload}, conversion for {fn} is ignored"
                    )
                    continue
                fn = getattr(ns_obj, op_overload)
        if isinstance(fn, torch._ops.OpOverloadPacket):
            # A packet stands for all of its overloads: register each one
            # that is not already claimed by another conversion.
            for overload in fn.overloads():
                other_fn = getattr(fn, overload)
                if other_fn not in conversions:
                    aten_fn_for_key.append(other_fn)
        aten_fn_for_key.append(fn)

    conversions.update({fn: wrapped for fn in aten_fn_for_key})
    if register_op_singleton_flag:
        return wrapped[0]
    else:
        return wrapped
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import Callable, Dict, Sequence, Union
|
|
3
|
+
|
|
4
|
+
import torch
|
|
5
|
+
from torch._decomp import register_decomposition
|
|
6
|
+
from torch._ops import OpOverload, OpOverloadPacket
|
|
7
|
+
|
|
8
|
+
# Module-level registry populated by register_decomposition_for_dicp,
# kept separate from torch's global decomposition table.
dicp_decomposition_table = {}
aten = torch.ops.aten
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def register_decomposition_for_dicp(fn):
    """Decorator registering a decomposition for *fn* into dicp's private
    ``dicp_decomposition_table`` rather than torch's global registry."""
    return register_decomposition(fn, registry=dicp_decomposition_table)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@register_decomposition_for_dicp(aten.count_nonzero.default)
def count_nonzero_default(x, dim=None):
    """Decompose aten.count_nonzero into a nonzero mask + int64 sum."""
    nonzero_mask = x != 0
    reduce_dims = [] if dim is None else dim
    return aten.sum.dim_IntList(
        nonzero_mask, dim=reduce_dims, keepdim=False, dtype=torch.int64
    )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_decompositions(
    aten_ops: Sequence[Union[OpOverload, OpOverloadPacket]],
    target_decomposition_table: Dict[OpOverload, Callable] = None,
) -> Dict[OpOverload, Callable]:
    """Select dicp decompositions for the requested aten ops.

    Args:
        aten_ops: ops to look up; an OpOverloadPacket selects every
            registered overload of that packet, an OpOverload selects
            itself.
        target_decomposition_table: optional table to merge results into
            (mutated in place); a fresh dict is created when ``None``.

    Returns:
        The (possibly caller-supplied) table with matching entries from
        ``dicp_decomposition_table`` added.
    """
    registry = dicp_decomposition_table
    # Group registered overloads under their packet for packet lookups.
    packets_to_overloads = defaultdict(list)
    for opo in registry:
        packets_to_overloads[opo.overloadpacket].append(opo)
    # NOTE(fix): compare against None rather than truthiness — the old
    # ``if target_decomposition_table`` silently replaced a caller-supplied
    # *empty* dict with a new one, so results never reached the caller's
    # table in that case.
    decompositions = (
        target_decomposition_table if target_decomposition_table is not None else {}
    )
    for op in aten_ops:
        if isinstance(op, OpOverloadPacket) and op in packets_to_overloads:
            for op_overload in packets_to_overloads[op]:
                decompositions[op_overload] = registry[op_overload]
        elif isinstance(op, OpOverload) and op in registry:
            decompositions[op] = registry[op]
    return decompositions
|