PyPI - tico - Versions diffs - 0.1.0__py3-none-any.whl - Mend

tico 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (206) hide show

tico/__init__.py +42 -0
tico/config/__init__.py +4 -0
tico/config/base.py +37 -0
tico/config/factory.py +41 -0
tico/config/v1.py +35 -0
tico/experimental/__init__.py +1 -0
tico/experimental/quantization/__init__.py +1 -0
tico/experimental/quantization/algorithm/__init__.py +1 -0
tico/experimental/quantization/algorithm/gptq/__init__.py +1 -0
tico/experimental/quantization/algorithm/gptq/gptq.py +172 -0
tico/experimental/quantization/algorithm/gptq/quant.py +153 -0
tico/experimental/quantization/algorithm/gptq/quantizer.py +225 -0
tico/experimental/quantization/algorithm/gptq/utils.py +65 -0
tico/experimental/quantization/algorithm/pt2e/__init__.py +1 -0
tico/experimental/quantization/algorithm/pt2e/annotation/__init__.py +1 -0
tico/experimental/quantization/algorithm/pt2e/annotation/annotator.py +215 -0
tico/experimental/quantization/algorithm/pt2e/annotation/config.py +26 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/__init__.py +21 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/adaptive_avg_pool2d.py +65 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/add.py +57 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/conv2d.py +92 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/div.py +57 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/linear.py +94 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/mean.py +53 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/mul.py +57 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/relu6.py +53 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/rsqrt.py +53 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/sub.py +57 -0
tico/experimental/quantization/algorithm/pt2e/annotation/spec.py +47 -0
tico/experimental/quantization/algorithm/pt2e/annotation/utils.py +88 -0
tico/experimental/quantization/algorithm/pt2e/quantizer.py +78 -0
tico/experimental/quantization/algorithm/pt2e/transformation/__init__.py +1 -0
tico/experimental/quantization/algorithm/pt2e/transformation/convert_scalars_to_attrs.py +58 -0
tico/experimental/quantization/algorithm/pt2e/utils.py +138 -0
tico/experimental/quantization/algorithm/smoothquant/__init__.py +1 -0
tico/experimental/quantization/algorithm/smoothquant/observer.py +78 -0
tico/experimental/quantization/algorithm/smoothquant/quantizer.py +81 -0
tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py +164 -0
tico/experimental/quantization/config.py +68 -0
tico/experimental/quantization/evaluation/__init__.py +1 -0
tico/experimental/quantization/evaluation/backend.py +20 -0
tico/experimental/quantization/evaluation/evaluate.py +223 -0
tico/experimental/quantization/evaluation/executor/__init__.py +1 -0
tico/experimental/quantization/evaluation/executor/backend_executor.py +54 -0
tico/experimental/quantization/evaluation/executor/circle_executor.py +75 -0
tico/experimental/quantization/evaluation/executor/triv24_executor.py +128 -0
tico/experimental/quantization/evaluation/metric.py +109 -0
tico/experimental/quantization/evaluation/utils.py +185 -0
tico/experimental/quantization/passes/__init__.py +1 -0
tico/experimental/quantization/passes/fold_quant_ops.py +154 -0
tico/experimental/quantization/passes/insert_quantize_on_dtype_mismatch.py +345 -0
tico/experimental/quantization/passes/propagate_qparam_backward.py +91 -0
tico/experimental/quantization/passes/propagate_qparam_forward.py +141 -0
tico/experimental/quantization/passes/quantize_bias.py +123 -0
tico/experimental/quantization/passes/remove_weight_dequant_op.py +177 -0
tico/experimental/quantization/public_interface.py +108 -0
tico/experimental/quantization/quantizer.py +71 -0
tico/interpreter/__init__.py +1 -0
tico/interpreter/infer.py +116 -0
tico/interpreter/interpreter.py +93 -0
tico/passes/__init__.py +1 -0
tico/passes/cast_aten_where_arg_type.py +191 -0
tico/passes/cast_mixed_type_args.py +187 -0
tico/passes/const_prop_pass.py +307 -0
tico/passes/convert_conv1d_to_conv2d.py +160 -0
tico/passes/convert_layout_op_to_reshape.py +85 -0
tico/passes/convert_repeat_to_expand_copy.py +89 -0
tico/passes/convert_to_relu6.py +181 -0
tico/passes/decompose_addmm.py +124 -0
tico/passes/decompose_batch_norm.py +192 -0
tico/passes/decompose_fake_quantize.py +134 -0
tico/passes/decompose_fake_quantize_tensor_qparams.py +294 -0
tico/passes/decompose_group_norm.py +275 -0
tico/passes/decompose_grouped_conv2d.py +209 -0
tico/passes/decompose_slice_scatter.py +169 -0
tico/passes/extract_dtype_kwargs.py +122 -0
tico/passes/fill_meta_val.py +57 -0
tico/passes/fuse_leading_unsqueeze_reshape.py +112 -0
tico/passes/fuse_redundant_reshape_to_mean.py +102 -0
tico/passes/legalize_causal_mask_value.py +108 -0
tico/passes/legalize_predefined_layout_operators.py +386 -0
tico/passes/lower_pow2_to_mul.py +75 -0
tico/passes/lower_to_resize_nearest_neighbor.py +235 -0
tico/passes/lower_to_slice.py +230 -0
tico/passes/merge_consecutive_cat.py +80 -0
tico/passes/ops.py +78 -0
tico/passes/remove_nop.py +84 -0
tico/passes/remove_redundant_assert_nodes.py +51 -0
tico/passes/remove_redundant_expand.py +66 -0
tico/passes/remove_redundant_permute.py +122 -0
tico/passes/remove_redundant_reshape.py +436 -0
tico/passes/remove_redundant_slice.py +62 -0
tico/passes/remove_redundant_to_copy.py +86 -0
tico/passes/restore_linear.py +115 -0
tico/passes/segment_index_select.py +145 -0
tico/pt2_to_circle.py +105 -0
tico/serialize/__init__.py +1 -0
tico/serialize/circle_graph.py +319 -0
tico/serialize/circle_mapping.py +177 -0
tico/serialize/circle_serializer.py +240 -0
tico/serialize/operators/__init__.py +28 -0
tico/serialize/operators/hashable_opcode.py +43 -0
tico/serialize/operators/node_visitor.py +80 -0
tico/serialize/operators/op_abs.py +53 -0
tico/serialize/operators/op_add.py +69 -0
tico/serialize/operators/op_alias_copy.py +64 -0
tico/serialize/operators/op_any.py +150 -0
tico/serialize/operators/op_arange_start_step.py +61 -0
tico/serialize/operators/op_argmax.py +62 -0
tico/serialize/operators/op_avg_pool2d.py +192 -0
tico/serialize/operators/op_bmm.py +62 -0
tico/serialize/operators/op_cat.py +66 -0
tico/serialize/operators/op_clamp.py +126 -0
tico/serialize/operators/op_clone.py +71 -0
tico/serialize/operators/op_constant_pad_nd.py +72 -0
tico/serialize/operators/op_conv2d.py +186 -0
tico/serialize/operators/op_copy.py +164 -0
tico/serialize/operators/op_cos.py +59 -0
tico/serialize/operators/op_cumsum.py +95 -0
tico/serialize/operators/op_depthwise_conv2d.py +199 -0
tico/serialize/operators/op_dequantize_per_channel.py +82 -0
tico/serialize/operators/op_dequantize_per_tensor.py +64 -0
tico/serialize/operators/op_div.py +62 -0
tico/serialize/operators/op_embedding.py +60 -0
tico/serialize/operators/op_eq.py +64 -0
tico/serialize/operators/op_exp.py +60 -0
tico/serialize/operators/op_expand.py +91 -0
tico/serialize/operators/op_full.py +48 -0
tico/serialize/operators/op_full_like.py +55 -0
tico/serialize/operators/op_ge.py +54 -0
tico/serialize/operators/op_gelu.py +59 -0
tico/serialize/operators/op_gt.py +54 -0
tico/serialize/operators/op_index.py +82 -0
tico/serialize/operators/op_index_select.py +64 -0
tico/serialize/operators/op_instance_norm.py +91 -0
tico/serialize/operators/op_leaky_relu.py +60 -0
tico/serialize/operators/op_linear.py +70 -0
tico/serialize/operators/op_log.py +53 -0
tico/serialize/operators/op_log1p.py +86 -0
tico/serialize/operators/op_logical_and.py +63 -0
tico/serialize/operators/op_logical_not.py +62 -0
tico/serialize/operators/op_lt.py +61 -0
tico/serialize/operators/op_max_dim.py +70 -0
tico/serialize/operators/op_max_pool2d_with_indices.py +155 -0
tico/serialize/operators/op_maximum.py +53 -0
tico/serialize/operators/op_mean.py +66 -0
tico/serialize/operators/op_minimum.py +53 -0
tico/serialize/operators/op_mm.py +177 -0
tico/serialize/operators/op_mul.py +99 -0
tico/serialize/operators/op_ne.py +54 -0
tico/serialize/operators/op_neg.py +59 -0
tico/serialize/operators/op_permute.py +65 -0
tico/serialize/operators/op_pow.py +141 -0
tico/serialize/operators/op_prelu.py +54 -0
tico/serialize/operators/op_quantize_per_tensor.py +79 -0
tico/serialize/operators/op_reciprocal.py +64 -0
tico/serialize/operators/op_relu.py +53 -0
tico/serialize/operators/op_relu6.py +52 -0
tico/serialize/operators/op_repeat.py +100 -0
tico/serialize/operators/op_reshape.py +73 -0
tico/serialize/operators/op_resize_nearest_neighbor.py +70 -0
tico/serialize/operators/op_rsqrt.py +53 -0
tico/serialize/operators/op_scalar_tensor.py +51 -0
tico/serialize/operators/op_select_copy.py +65 -0
tico/serialize/operators/op_sigmoid.py +56 -0
tico/serialize/operators/op_sin.py +53 -0
tico/serialize/operators/op_slice.py +155 -0
tico/serialize/operators/op_softmax.py +100 -0
tico/serialize/operators/op_split_with_sizes.py +99 -0
tico/serialize/operators/op_sqrt.py +55 -0
tico/serialize/operators/op_squeeze.py +73 -0
tico/serialize/operators/op_sub.py +71 -0
tico/serialize/operators/op_sum.py +63 -0
tico/serialize/operators/op_tanh.py +54 -0
tico/serialize/operators/op_to_copy.py +105 -0
tico/serialize/operators/op_unsqueeze.py +66 -0
tico/serialize/operators/op_view.py +74 -0
tico/serialize/operators/op_where.py +82 -0
tico/serialize/operators/utils.py +94 -0
tico/serialize/pack.py +35 -0
tico/serialize/quant_param.py +42 -0
tico/utils/__init__.py +1 -0
tico/utils/convert.py +296 -0
tico/utils/define.py +35 -0
tico/utils/diff_graph.py +181 -0
tico/utils/errors.py +35 -0
tico/utils/graph.py +282 -0
tico/utils/logging.py +45 -0
tico/utils/model.py +37 -0
tico/utils/mx/__init__.py +1 -0
tico/utils/mx/elemwise_ops.py +267 -0
tico/utils/mx/formats.py +125 -0
tico/utils/mx/mx_ops.py +270 -0
tico/utils/padding.py +47 -0
tico/utils/passes.py +76 -0
tico/utils/register_custom_op.py +609 -0
tico/utils/serialize.py +42 -0
tico/utils/trace_decorators.py +101 -0
tico/utils/utils.py +406 -0
tico/utils/validate_args_kwargs.py +1149 -0
tico-0.1.0.dist-info/LICENSE +241 -0
tico-0.1.0.dist-info/METADATA +354 -0
tico-0.1.0.dist-info/RECORD +206 -0
tico-0.1.0.dist-info/WHEEL +5 -0
tico-0.1.0.dist-info/entry_points.txt +3 -0
tico-0.1.0.dist-info/top_level.txt +1 -0

tico/passes/fuse_redundant_reshape_to_mean.py ADDED Viewed

@@ -0,0 +1,102 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch.export import ExportedProgram
+from torch.utils import _pytree as pytree
+from tico.passes import ops
+from tico.serialize.circle_mapping import extract_shape
+from tico.utils import logging
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.utils import is_target_node
+@trace_graph_diff_on_pass
+class FuseRedundantReshapeToMean(PassBase):
+    """
+    This pass removes redundant `aten.reshape` operators that can be fused to `aten.mean` with `keep_dims`.
+    Shape(aten.reshape(aten.mean(input))) == Shape(aten.mean(input, keep_dims=True))
+    """
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for node in graph.nodes:
+            if not is_target_node(node, torch.ops.aten.mean.dim):
+                continue
+            # If mean is being used in other nodes, do not fuse it.
+            if len(node.users) != 1:
+                continue
+            user_node = next(iter(node.users))
+            if not is_target_node(user_node, ops.aten.reshape):
+                continue
+            mean_args, mean_kwargs = pytree.tree_map_only(
+                torch.fx.Node,
+                lambda n: n.meta["val"],
+                (node.args, node.kwargs),
+            )
+            # Signature of aten.mean.dim is as follows.
+            # mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+            # `keepdim` in `node.kwargs` is moved to `node.args` in `run_decompositions`.
+            # `dtype` in `node.kwargs` is not moved
+            assert len(mean_args) == 3 or len(mean_args) == 2  # keepdim exists or not
+            assert len(mean_kwargs) <= 1  # dtype exists or not
+            fused_mean_args = mean_args
+            keep_dims = True
+            if len(mean_args) == 2:
+                fused_mean_args += (keep_dims,)
+            fused_val = node.target(*fused_mean_args, **mean_kwargs)
+            # Check if both shapes are same
+            # 1. Shape(aten.reshape(aten.mean))
+            # 2. Shape(aten.mean(keep_dims=True))
+            if fused_val.size() != extract_shape(user_node):
+                continue
+            # update args
+            if len(mean_args) == 2:
+                updated_args = node.args + (keep_dims,)
+            elif len(mean_args) == 3:
+                updated_args = node.args
+            else:
+                raise RuntimeError("Invalid input")
+            node.args = updated_args
+            node.meta["val"] = fused_val
+            user_node.replace_all_uses_with(node, propagate_meta=False)
+            modified = True
+            logger.debug(f"{user_node.name} is replaced with {node.name}")
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)

tico/passes/legalize_causal_mask_value.py ADDED Viewed

@@ -0,0 +1,108 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch.export import ExportedProgram
+from tico.passes import ops
+from tico.utils import logging
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.utils import is_target_node
+from tico.utils.validate_args_kwargs import AddTensorArgs
+@trace_graph_diff_on_pass
+class LegalizeCausalMaskValue(PassBase):
+    """
+    This pass replaces occurrences of -inf in attention masks with a large negative finite value (e.g., -120) to ensure numerical stability in computations, particularly in softmax operations.
+    This pass can be turned enable only when
+        1. The model will be quantized later (e.g., by circle-quantizer).
+        2. Softmax kernel of our backend does not support masking.
+        3. `Add with -inf` is used only for masking.
+    """
+    def __init__(self, enabled: bool = False):
+        super().__init__()
+        self.enabled = enabled
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        if not self.enabled:
+            return PassResult(False)
+        new_mask = -120  # Make it configurable
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for node in graph.nodes:
+            if not is_target_node(node, ops.aten.add):
+                continue
+            args = AddTensorArgs(*node.args, **node.kwargs)
+            input = args.input
+            other = args.other
+            if (
+                isinstance(input, torch.fx.Node)
+                and input.name
+                in exported_program.graph_signature.lifted_tensor_constants
+            ):
+                mask_node = input
+            elif (
+                isinstance(other, torch.fx.Node)
+                and other.name
+                in exported_program.graph_signature.lifted_tensor_constants
+            ):
+                mask_node = other
+            else:
+                continue
+            mask_node_name = (
+                exported_program.graph_signature.inputs_to_lifted_tensor_constants[
+                    mask_node.name
+                ]
+            )
+            mask_data = exported_program.constants[mask_node_name]
+            # WHY Use -1.e+38, not -float('inf') or torch.finfo(torch.float32).min?
+            #
+            # torch.finfo(torch.float32).min is -3.4028234663852886e+38 but it changes while processed in const prop or other passes.
+            # Therefore, use a rounded value and compare to know it's very large negative number.
+            fp32_minus_inf_rounded = -1.0e38
+            if torch.all(
+                torch.logical_or(mask_data == 0, mask_data < fp32_minus_inf_rounded)
+            ):
+                exported_program.constants[mask_node_name] = torch.where(
+                    mask_data < fp32_minus_inf_rounded,
+                    torch.tensor(new_mask, dtype=mask_data.dtype),
+                    mask_data,
+                )
+            modified = False  # To run only once
+            logger.debug(
+                f"{mask_node.name}'s mask data are changed from '-inf' to {new_mask}"
+            )
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)

tico/passes/legalize_predefined_layout_operators.py ADDED Viewed

@@ -0,0 +1,386 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from types import NoneType
+from typing import Optional, TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch.export import ExportedProgram
+from tico.serialize.circle_graph import extract_shape
+from tico.utils import logging
+from tico.utils.errors import NotYetSupportedError
+from tico.utils.graph import create_node
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.utils import is_target_node
+from tico.utils.validate_args_kwargs import (
+    AvgPool2dArgs,
+    Conv2DArgs,
+    DequantizePerChannelArgs,
+    DequantizePerTensorArgs,
+    InstanceNormArgs,
+    MaxPool2dWithIndicesArgs,
+)
+def get_permute_weight_input(conv_args: Conv2DArgs) -> torch.fx.Node:
+    """
+    Retrieves the weight input for the permute operation.
+    This function extracts the weight tensor from the given convolution arguments.
+    If the weight is in floating point format, it is returned directly.
+    If the weight is quantized and followed by a Dequantize operation, the function
+     returns the input of the Dequantize node (i.e., the original quantized weight)
+    """
+    weight = conv_args.weight
+    dq_args: Optional[DequantizePerChannelArgs | DequantizePerTensorArgs] = None
+    if weight.target == torch.ops.quantized_decomposed.dequantize_per_channel.default:
+        dq_args = DequantizePerChannelArgs(*weight.args, *weight.kwargs)  # type: ignore[arg-type]
+    elif weight.target == torch.ops.quantized_decomposed.dequantize_per_tensor.default:
+        dq_args = DequantizePerTensorArgs(*weight.args, *weight.kwargs)  # type: ignore[arg-type]
+    return getattr(dq_args, "input", weight)
+@trace_graph_diff_on_pass
+class LegalizePreDefinedLayoutOperators(PassBase):
+    """
+    Pytorch basically assumes NCHW memory format. But, Circle assumes NHWC. Specifically, some operators have kernels only for NHWC memory format.
+    So, we need to permute the dimensions accordingly.
+    NOTE. This pass DOES NOT CHANGE node.kwargs["memory_format"]. It changes memory formats by inserting `aten.permute` operators.
+    Besides conv2d (illustrated below), `call` also legalizes aten.max_pool2d_with_indices, aten.avg_pool2d and
+    aten.instance_norm in the same "permute -> circle_custom op -> permute back" manner.
+    [1] aten.conv2d with group = 1 (circle_custom.conv2d)
+        [BEFORE PASS]
+          Input[NCHW] ------------------- aten.conv2d[NCHW] ---- OUTPUT[NCHW]
+          Weight[NCHW] - (aten.dequantize) ---/
+          Bias --------- (aten.dequantize) --/
+        [AFTER PASS]
+          Input[NCHW] ---- aten.permute(NCHW_to_NHWC) ---------- circle_custom.conv2d[NHWC] ---- aten.permute(NHWC_to_NCHW) ---- OUTPUT[NCHW]
+          Weight[NCHW] - (aten.dequantize) - aten.permute(NCHW_to_NHWC) ---/
+          Bias --------- (aten.dequantize) -------------------------------/
+    [2] aten.conv2d with group == Input[C] (circle_custom.depthwise_conv2d)
+        NOTE: Weight layout is CNHW (IOHW)
+        [BEFORE PASS]
+          Input[NCHW] -------------- aten.conv2d[NCHW] ---- OUTPUT[NCHW]
+          Weight[CNHW] - (aten.dequantize) --/
+          Bias ----------(aten.dequantize) -/
+        [AFTER PASS]
+          Input[NCHW] ---- aten.permute(NCHW_to_NHWC) ---- circle_custom.depthwise_conv2d[NHWC] ---- aten.permute(NHWC_to_NCHW) ---- OUTPUT[NCHW]
+          Weight[CNHW] - (aten.dequantize) - aten.permute(CNHW_to_NHWC) ---/
+          Bias ----------(aten.dequantize) -------------------------------/
+    """
+    def __init__(self):
+        super().__init__()
+    def legalize_conv2d(self, exported_program, node) -> bool:
+        """
+        Replace an `aten.conv2d` node with a `circle_custom` conv2d / depthwise_conv2d op wrapped in permutes.
+        Raises NotYetSupportedError when the input is not 4D or when `groups` is neither 1 nor the
+        input channel count. Returns True once the node's uses have been redirected.
+        """
+        logger = logging.getLogger(__name__)
+        modified = False
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        # conv2d            (Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+        # conv2d.padding    (Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+        args = Conv2DArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+        input = args.input
+        padding = args.padding
+        groups = args.groups
+        input_shape = extract_shape(input)
+        if not (len(input_shape) == 4):
+            raise NotYetSupportedError(
+                f"Only support 4D input tensor: node's input shape: {input_shape}"
+            )
+        # input_shape[1] is the channel dimension of the NCHW input.
+        if not (groups == 1 or groups == input_shape[1]):
+            raise NotYetSupportedError(
+                f"Only support groups=1 or groups=input_channels: node's groups: {groups}, input channels: {input_shape[1]}"
+            )
+        NCHW_to_NHWC = [0, 2, 3, 1]
+        # TODO Introduce a method that inserts permute op.
+        # input permute
+        with graph.inserting_after(input):
+            input_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(input, NCHW_to_NHWC),
+                origin=input,
+            )
+            # Rewire the original conv node first; its (updated) args are reused for the circle op below.
+            node.update_arg(node.args.index(input), input_permute)
+        # weight permute
+        # `weight` is the node to permute: the raw weight, or the input of a dequantize op if one is present.
+        weight = get_permute_weight_input(args)
+        with graph.inserting_after(weight):
+            if groups == 1:
+                # circle_custom.conv2d
+                perm = [0, 2, 3, 1]  # OIHW_to_OHWI
+            elif groups == input_shape[1]:
+                # circle_custom.depthwise_conv2d
+                perm = [1, 2, 3, 0]  # O1HW_to_1HWO
+            else:
+                assert groups == 1 or groups == input_shape[1]  # Cannot reach here
+            weight_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(weight, perm),
+                origin=weight,
+            )
+            if args.weight.target in [
+                torch.ops.quantized_decomposed.dequantize_per_channel.default,
+                torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+            ]:
+                # Quantized weight: the permute goes between the quantized tensor and the dequantize op.
+                # NOTE(review): only the dequantize input is rewired here; its remaining args are left
+                # as-is — confirm any per-channel axis is adjusted elsewhere.
+                dq = args.weight
+                dq.update_arg(dq.args.index(weight), weight_permute)
+                # Need to update dq.meta["val"] in FillMetaVal pass.
+                del dq.meta["val"]
+            else:
+                node.update_arg(node.args.index(weight), weight_permute)
+        with graph.inserting_before(node):
+            # Select the circle op from (groups, padding type); list padding and string padding map to different overloads.
+            legalized_op = None
+            if groups == 1:
+                if isinstance(padding, list):
+                    legalized_op = torch.ops.circle_custom.conv2d
+                elif isinstance(padding, str):
+                    legalized_op = torch.ops.circle_custom.conv2d.padding
+            elif groups == input_shape[1]:
+                if isinstance(padding, list):
+                    legalized_op = torch.ops.circle_custom.depthwise_conv2d
+                elif isinstance(padding, str):
+                    legalized_op = torch.ops.circle_custom.depthwise_conv2d.padding
+            else:
+                assert groups == 1 or groups == input_shape[1]  # Cannot reach here
+            assert legalized_op is not None
+            # node.args already points at the permuted input (and permuted weight, when not dequantized).
+            circle_op = create_node(
+                graph, legalized_op, args=node.args, kwargs=node.kwargs, origin=node
+            )
+            # output permute
+            NHWC_to_NCHW = [0, 3, 1, 2]
+            conv_out_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(circle_op, NHWC_to_NCHW),
+            )
+        node.replace_all_uses_with(conv_out_permute, propagate_meta=True)
+        logger.debug(f"{node.name} is replaced with {circle_op.name}")
+        modified = True
+        return modified
+    def legalize_instance_norm(self, exported_program, node) -> bool:
+        """
+        Replace an `aten.instance_norm` node with `circle_custom.instance_norm` wrapped in permutes.
+        Raises NotYetSupportedError unless use_input_stats is True, running_mean and running_var are
+        None, and both weight and bias are given. Returns True once the node's uses have been redirected.
+        """
+        logger = logging.getLogger(__name__)
+        modified = False
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        # instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor
+        args = InstanceNormArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+        input = args.input
+        weight = args.weight
+        bias = args.bias
+        # NOTE: `eps` is read here but not used in this method.
+        eps = args.eps
+        running_mean = args.running_mean
+        running_var = args.running_var
+        use_input_stats = args.use_input_stats
+        if not (use_input_stats == True):
+            raise NotYetSupportedError("Only support use_input_stats is True.")
+        if not isinstance(running_mean, NoneType):
+            raise NotYetSupportedError("Only support running_mean=None")
+        if not isinstance(running_var, NoneType):
+            raise NotYetSupportedError("Only support running_var=None")
+        if weight is None:
+            # TODO Support weight=None
+            raise NotYetSupportedError("Only support weight is not None.")
+        if bias is None:
+            # TODO Support bias=None
+            raise NotYetSupportedError("Only support bias is not None.")
+        with graph.inserting_after(input):
+            # input permute
+            NCHW_to_NHWC = [0, 2, 3, 1]
+            input_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(input, NCHW_to_NHWC),
+                origin=input,
+            )
+            # Rewire the original node first; its updated args are reused for the circle op below.
+            node.update_arg(node.args.index(input), input_permute)
+        with graph.inserting_before(node):
+            # circle instnorm
+            circle_instnorm = create_node(
+                graph,
+                torch.ops.circle_custom.instance_norm,
+                args=node.args,
+                kwargs=node.kwargs,
+                origin=node,
+            )
+            # output permute
+            NHWC_to_NCHW = [0, 3, 1, 2]
+            instnorm_out_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(circle_instnorm, NHWC_to_NCHW),
+            )
+        node.replace_all_uses_with(instnorm_out_permute, propagate_meta=True)
+        logger.debug(f"{node.name} is replaced with {circle_instnorm.name}")
+        modified = True
+        return modified
+    def legalize_max_pool2d_with_indices(self, exported_program, node) -> bool:
+        """
+        Replace an `aten.max_pool2d_with_indices` node with `circle_custom.maxpool2d` wrapped in permutes.
+        Raises NotYetSupportedError when ceil_mode is set or when the node does not have exactly one user.
+        Returns True once the single user's uses have been redirected.
+        """
+        logger = logging.getLogger(__name__)
+        modified = False
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        # max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+        args = MaxPool2dWithIndicesArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+        input_ = args.input
+        # NOTE: kernel_size / stride / padding / dilation are read here but not used in this method;
+        # the circle op receives them through node.args / node.kwargs.
+        kernel_size = args.kernel_size
+        stride = args.stride
+        padding = args.padding
+        dilation = args.dilation
+        ceil_mode = args.ceil_mode
+        if ceil_mode:
+            raise NotYetSupportedError("Only support non-ceil model.")
+        # Exactly one user is required; it is treated below as the getitem that extracts the pooled values.
+        if len(node.users.keys()) != 1:
+            raise NotYetSupportedError(
+                "Only support maxpool2d with 'return_indices=False'."
+            )
+        NCHW_to_NHWC = [0, 2, 3, 1]
+        # TODO Introduce a method that inserts permute op.
+        # input permute
+        with graph.inserting_after(input_):
+            input_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(input_, NCHW_to_NHWC),
+                origin=input_,
+            )
+            # Rewire the original node first; its updated args are reused for the circle op below.
+            node.update_arg(node.args.index(input_), input_permute)
+        with graph.inserting_before(node):
+            legalized_op = torch.ops.circle_custom.maxpool2d
+            circle_maxpool2d = create_node(
+                graph, legalized_op, args=node.args, kwargs=node.kwargs, origin=node
+            )
+            # output permute
+            NHWC_to_NCHW = [0, 3, 1, 2]
+            maxpool_out_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(circle_maxpool2d, NHWC_to_NCHW),
+            )
+        # Redirect consumers of the single user (not of `node` itself) to the permuted circle output.
+        get_item, *_ = node.users.keys()
+        get_item.replace_all_uses_with(maxpool_out_permute, propagate_meta=True)
+        logger.debug(f"{node.name} is replaced with {circle_maxpool2d.name}")
+        modified = True
+        return modified
+    def legalize_avg_pool2d(self, exported_program, node) -> bool:
+        """
+        Replace an `aten.avg_pool2d` node with `circle_custom.avgpool2d` wrapped in permutes.
+        Raises NotYetSupportedError when ceil_mode is set or divisor_override is not None.
+        Returns True once the node's uses have been redirected.
+        """
+        logger = logging.getLogger(__name__)
+        modified = False
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        # avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> (Tensor)
+        args = AvgPool2dArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+        input_ = args.input
+        # NOTE: kernel_size / stride / padding are read here but not used in this method;
+        # the circle op receives them through node.args / node.kwargs.
+        kernel_size = args.kernel_size
+        stride = args.stride
+        padding = args.padding
+        ceil_mode = args.ceil_mode
+        if ceil_mode:
+            raise NotYetSupportedError("Only support non-ceil model.")
+        divisor_override = args.divisor_override
+        if divisor_override is not None:
+            raise NotYetSupportedError(
+                "For the case that the divisor_override is not None is not yet supported."
+            )
+        NCHW_to_NHWC = [0, 2, 3, 1]
+        # TODO Introduce a method that inserts permute op.
+        # input permute
+        with graph.inserting_after(input_):
+            input_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(input_, NCHW_to_NHWC),
+                origin=input_,
+            )
+            # Rewire the original node first; its updated args are reused for the circle op below.
+            node.update_arg(node.args.index(input_), input_permute)
+        with graph.inserting_before(node):
+            legalized_op = torch.ops.circle_custom.avgpool2d
+            circle_avgpool2d = create_node(
+                graph, legalized_op, args=node.args, kwargs=node.kwargs, origin=node
+            )
+            # output permute
+            NHWC_to_NCHW = [0, 3, 1, 2]
+            avgpool_out_permute = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(circle_avgpool2d, NHWC_to_NCHW),
+            )
+        node.replace_all_uses_with(avgpool_out_permute, propagate_meta=True)
+        logger.debug(f"{node.name} is replaced with {circle_avgpool2d.name}")
+        modified = True
+        return modified
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        Dispatch every supported node in the graph to its legalize_* method.
+        Returns a PassResult whose flag is True if any node was legalized.
+        """
+        # Maps each supported aten overload to the method that legalizes it.
+        target_to_legalize_func = {
+            torch.ops.aten.conv2d.default: self.legalize_conv2d,
+            torch.ops.aten.conv2d.padding: self.legalize_conv2d,
+            torch.ops.aten.max_pool2d_with_indices.default: self.legalize_max_pool2d_with_indices,
+            torch.ops.aten.avg_pool2d.default: self.legalize_avg_pool2d,
+            torch.ops.aten.instance_norm.default: self.legalize_instance_norm,
+        }
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for node in graph.nodes:
+            if not is_target_node(node, list(target_to_legalize_func.keys())):
+                continue
+            modified |= target_to_legalize_func[node.target](exported_program, node)
+        # The replaced aten nodes have no users left, so dead-code elimination drops them.
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)

tico/passes/lower_pow2_to_mul.py ADDED Viewed

@@ -0,0 +1,75 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch.export import ExportedProgram
+from tico.utils import logging
+from tico.utils.graph import create_node
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.utils import is_target_node
+from tico.utils.validate_args_kwargs import PowTensorScalarArgs
+@trace_graph_diff_on_pass
+class LowerPow2ToMul(PassBase):
+    """
+    This pass lowers pow operator whose exponent is 2 to mul.
+    E.g. `Pow(in_, 2)` -> `Mul(in_, in_)`
+    """
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for node in graph.nodes:
+            if not is_target_node(node, torch.ops.aten.pow.Tensor_Scalar):
+                continue
+            args = PowTensorScalarArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+            in_, exp = args.input, args.exponent
+            if exp != 2:
+                continue
+            lhs = rhs = in_
+            with graph.inserting_after(node):
+                new_mul = create_node(
+                    graph,
+                    torch.ops.aten.mul.Tensor,
+                    args=(lhs, rhs),
+                    kwargs={},
+                )
+            node.replace_all_uses_with(new_mul, propagate_meta=True)
+            modified = True
+            logger.debug(f"{node.name} is replaced with {new_mul.name}")
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)