PyPI - tico - Versions diffs - 0.1.0__py3-none-any.whl - Mend

tico 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (206) hide show

tico/__init__.py +42 -0
tico/config/__init__.py +4 -0
tico/config/base.py +37 -0
tico/config/factory.py +41 -0
tico/config/v1.py +35 -0
tico/experimental/__init__.py +1 -0
tico/experimental/quantization/__init__.py +1 -0
tico/experimental/quantization/algorithm/__init__.py +1 -0
tico/experimental/quantization/algorithm/gptq/__init__.py +1 -0
tico/experimental/quantization/algorithm/gptq/gptq.py +172 -0
tico/experimental/quantization/algorithm/gptq/quant.py +153 -0
tico/experimental/quantization/algorithm/gptq/quantizer.py +225 -0
tico/experimental/quantization/algorithm/gptq/utils.py +65 -0
tico/experimental/quantization/algorithm/pt2e/__init__.py +1 -0
tico/experimental/quantization/algorithm/pt2e/annotation/__init__.py +1 -0
tico/experimental/quantization/algorithm/pt2e/annotation/annotator.py +215 -0
tico/experimental/quantization/algorithm/pt2e/annotation/config.py +26 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/__init__.py +21 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/adaptive_avg_pool2d.py +65 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/add.py +57 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/conv2d.py +92 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/div.py +57 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/linear.py +94 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/mean.py +53 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/mul.py +57 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/relu6.py +53 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/rsqrt.py +53 -0
tico/experimental/quantization/algorithm/pt2e/annotation/op/sub.py +57 -0
tico/experimental/quantization/algorithm/pt2e/annotation/spec.py +47 -0
tico/experimental/quantization/algorithm/pt2e/annotation/utils.py +88 -0
tico/experimental/quantization/algorithm/pt2e/quantizer.py +78 -0
tico/experimental/quantization/algorithm/pt2e/transformation/__init__.py +1 -0
tico/experimental/quantization/algorithm/pt2e/transformation/convert_scalars_to_attrs.py +58 -0
tico/experimental/quantization/algorithm/pt2e/utils.py +138 -0
tico/experimental/quantization/algorithm/smoothquant/__init__.py +1 -0
tico/experimental/quantization/algorithm/smoothquant/observer.py +78 -0
tico/experimental/quantization/algorithm/smoothquant/quantizer.py +81 -0
tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py +164 -0
tico/experimental/quantization/config.py +68 -0
tico/experimental/quantization/evaluation/__init__.py +1 -0
tico/experimental/quantization/evaluation/backend.py +20 -0
tico/experimental/quantization/evaluation/evaluate.py +223 -0
tico/experimental/quantization/evaluation/executor/__init__.py +1 -0
tico/experimental/quantization/evaluation/executor/backend_executor.py +54 -0
tico/experimental/quantization/evaluation/executor/circle_executor.py +75 -0
tico/experimental/quantization/evaluation/executor/triv24_executor.py +128 -0
tico/experimental/quantization/evaluation/metric.py +109 -0
tico/experimental/quantization/evaluation/utils.py +185 -0
tico/experimental/quantization/passes/__init__.py +1 -0
tico/experimental/quantization/passes/fold_quant_ops.py +154 -0
tico/experimental/quantization/passes/insert_quantize_on_dtype_mismatch.py +345 -0
tico/experimental/quantization/passes/propagate_qparam_backward.py +91 -0
tico/experimental/quantization/passes/propagate_qparam_forward.py +141 -0
tico/experimental/quantization/passes/quantize_bias.py +123 -0
tico/experimental/quantization/passes/remove_weight_dequant_op.py +177 -0
tico/experimental/quantization/public_interface.py +108 -0
tico/experimental/quantization/quantizer.py +71 -0
tico/interpreter/__init__.py +1 -0
tico/interpreter/infer.py +116 -0
tico/interpreter/interpreter.py +93 -0
tico/passes/__init__.py +1 -0
tico/passes/cast_aten_where_arg_type.py +191 -0
tico/passes/cast_mixed_type_args.py +187 -0
tico/passes/const_prop_pass.py +307 -0
tico/passes/convert_conv1d_to_conv2d.py +160 -0
tico/passes/convert_layout_op_to_reshape.py +85 -0
tico/passes/convert_repeat_to_expand_copy.py +89 -0
tico/passes/convert_to_relu6.py +181 -0
tico/passes/decompose_addmm.py +124 -0
tico/passes/decompose_batch_norm.py +192 -0
tico/passes/decompose_fake_quantize.py +134 -0
tico/passes/decompose_fake_quantize_tensor_qparams.py +294 -0
tico/passes/decompose_group_norm.py +275 -0
tico/passes/decompose_grouped_conv2d.py +209 -0
tico/passes/decompose_slice_scatter.py +169 -0
tico/passes/extract_dtype_kwargs.py +122 -0
tico/passes/fill_meta_val.py +57 -0
tico/passes/fuse_leading_unsqueeze_reshape.py +112 -0
tico/passes/fuse_redundant_reshape_to_mean.py +102 -0
tico/passes/legalize_causal_mask_value.py +108 -0
tico/passes/legalize_predefined_layout_operators.py +386 -0
tico/passes/lower_pow2_to_mul.py +75 -0
tico/passes/lower_to_resize_nearest_neighbor.py +235 -0
tico/passes/lower_to_slice.py +230 -0
tico/passes/merge_consecutive_cat.py +80 -0
tico/passes/ops.py +78 -0
tico/passes/remove_nop.py +84 -0
tico/passes/remove_redundant_assert_nodes.py +51 -0
tico/passes/remove_redundant_expand.py +66 -0
tico/passes/remove_redundant_permute.py +122 -0
tico/passes/remove_redundant_reshape.py +436 -0
tico/passes/remove_redundant_slice.py +62 -0
tico/passes/remove_redundant_to_copy.py +86 -0
tico/passes/restore_linear.py +115 -0
tico/passes/segment_index_select.py +145 -0
tico/pt2_to_circle.py +105 -0
tico/serialize/__init__.py +1 -0
tico/serialize/circle_graph.py +319 -0
tico/serialize/circle_mapping.py +177 -0
tico/serialize/circle_serializer.py +240 -0
tico/serialize/operators/__init__.py +28 -0
tico/serialize/operators/hashable_opcode.py +43 -0
tico/serialize/operators/node_visitor.py +80 -0
tico/serialize/operators/op_abs.py +53 -0
tico/serialize/operators/op_add.py +69 -0
tico/serialize/operators/op_alias_copy.py +64 -0
tico/serialize/operators/op_any.py +150 -0
tico/serialize/operators/op_arange_start_step.py +61 -0
tico/serialize/operators/op_argmax.py +62 -0
tico/serialize/operators/op_avg_pool2d.py +192 -0
tico/serialize/operators/op_bmm.py +62 -0
tico/serialize/operators/op_cat.py +66 -0
tico/serialize/operators/op_clamp.py +126 -0
tico/serialize/operators/op_clone.py +71 -0
tico/serialize/operators/op_constant_pad_nd.py +72 -0
tico/serialize/operators/op_conv2d.py +186 -0
tico/serialize/operators/op_copy.py +164 -0
tico/serialize/operators/op_cos.py +59 -0
tico/serialize/operators/op_cumsum.py +95 -0
tico/serialize/operators/op_depthwise_conv2d.py +199 -0
tico/serialize/operators/op_dequantize_per_channel.py +82 -0
tico/serialize/operators/op_dequantize_per_tensor.py +64 -0
tico/serialize/operators/op_div.py +62 -0
tico/serialize/operators/op_embedding.py +60 -0
tico/serialize/operators/op_eq.py +64 -0
tico/serialize/operators/op_exp.py +60 -0
tico/serialize/operators/op_expand.py +91 -0
tico/serialize/operators/op_full.py +48 -0
tico/serialize/operators/op_full_like.py +55 -0
tico/serialize/operators/op_ge.py +54 -0
tico/serialize/operators/op_gelu.py +59 -0
tico/serialize/operators/op_gt.py +54 -0
tico/serialize/operators/op_index.py +82 -0
tico/serialize/operators/op_index_select.py +64 -0
tico/serialize/operators/op_instance_norm.py +91 -0
tico/serialize/operators/op_leaky_relu.py +60 -0
tico/serialize/operators/op_linear.py +70 -0
tico/serialize/operators/op_log.py +53 -0
tico/serialize/operators/op_log1p.py +86 -0
tico/serialize/operators/op_logical_and.py +63 -0
tico/serialize/operators/op_logical_not.py +62 -0
tico/serialize/operators/op_lt.py +61 -0
tico/serialize/operators/op_max_dim.py +70 -0
tico/serialize/operators/op_max_pool2d_with_indices.py +155 -0
tico/serialize/operators/op_maximum.py +53 -0
tico/serialize/operators/op_mean.py +66 -0
tico/serialize/operators/op_minimum.py +53 -0
tico/serialize/operators/op_mm.py +177 -0
tico/serialize/operators/op_mul.py +99 -0
tico/serialize/operators/op_ne.py +54 -0
tico/serialize/operators/op_neg.py +59 -0
tico/serialize/operators/op_permute.py +65 -0
tico/serialize/operators/op_pow.py +141 -0
tico/serialize/operators/op_prelu.py +54 -0
tico/serialize/operators/op_quantize_per_tensor.py +79 -0
tico/serialize/operators/op_reciprocal.py +64 -0
tico/serialize/operators/op_relu.py +53 -0
tico/serialize/operators/op_relu6.py +52 -0
tico/serialize/operators/op_repeat.py +100 -0
tico/serialize/operators/op_reshape.py +73 -0
tico/serialize/operators/op_resize_nearest_neighbor.py +70 -0
tico/serialize/operators/op_rsqrt.py +53 -0
tico/serialize/operators/op_scalar_tensor.py +51 -0
tico/serialize/operators/op_select_copy.py +65 -0
tico/serialize/operators/op_sigmoid.py +56 -0
tico/serialize/operators/op_sin.py +53 -0
tico/serialize/operators/op_slice.py +155 -0
tico/serialize/operators/op_softmax.py +100 -0
tico/serialize/operators/op_split_with_sizes.py +99 -0
tico/serialize/operators/op_sqrt.py +55 -0
tico/serialize/operators/op_squeeze.py +73 -0
tico/serialize/operators/op_sub.py +71 -0
tico/serialize/operators/op_sum.py +63 -0
tico/serialize/operators/op_tanh.py +54 -0
tico/serialize/operators/op_to_copy.py +105 -0
tico/serialize/operators/op_unsqueeze.py +66 -0
tico/serialize/operators/op_view.py +74 -0
tico/serialize/operators/op_where.py +82 -0
tico/serialize/operators/utils.py +94 -0
tico/serialize/pack.py +35 -0
tico/serialize/quant_param.py +42 -0
tico/utils/__init__.py +1 -0
tico/utils/convert.py +296 -0
tico/utils/define.py +35 -0
tico/utils/diff_graph.py +181 -0
tico/utils/errors.py +35 -0
tico/utils/graph.py +282 -0
tico/utils/logging.py +45 -0
tico/utils/model.py +37 -0
tico/utils/mx/__init__.py +1 -0
tico/utils/mx/elemwise_ops.py +267 -0
tico/utils/mx/formats.py +125 -0
tico/utils/mx/mx_ops.py +270 -0
tico/utils/padding.py +47 -0
tico/utils/passes.py +76 -0
tico/utils/register_custom_op.py +609 -0
tico/utils/serialize.py +42 -0
tico/utils/trace_decorators.py +101 -0
tico/utils/utils.py +406 -0
tico/utils/validate_args_kwargs.py +1149 -0
tico-0.1.0.dist-info/LICENSE +241 -0
tico-0.1.0.dist-info/METADATA +354 -0
tico-0.1.0.dist-info/RECORD +206 -0
tico-0.1.0.dist-info/WHEEL +5 -0
tico-0.1.0.dist-info/entry_points.txt +3 -0
tico-0.1.0.dist-info/top_level.txt +1 -0

tico/passes/remove_redundant_assert_nodes.py ADDED Viewed

@@ -0,0 +1,51 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from torch.export import ExportedProgram
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.utils import is_target_node
+# Operator overloads regarded as assertion nodes; nodes whose target is listed
+# here are erased by `RemoveRedundantAssertionNodes` below.
+assert_node_targets = [
+    torch.ops.aten._assert_tensor_metadata.default,
+]
+@trace_graph_diff_on_pass
+class RemoveRedundantAssertionNodes(PassBase):
+    """
+    This removes redundant assertion nodes.
+    - `aten.assert_tensor_meta.default`
+    """
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for node in graph.nodes:
+            if is_target_node(node, assert_node_targets):
+                graph.erase_node(node)
+                modified = True
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)

tico/passes/remove_redundant_expand.py ADDED Viewed

@@ -0,0 +1,66 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch.export import ExportedProgram
+from tico.passes import ops
+from tico.serialize.circle_mapping import extract_shape
+from tico.utils import logging
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.utils import is_target_node
+from tico.utils.validate_args_kwargs import ExpandArgs
+@trace_graph_diff_on_pass
+class RemoveRedundantExpand(PassBase):
+    """
+    This pass removes redundant `aten.expand` operators where shapes of input and output are same.
+    """
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for node in graph.nodes:
+            if not is_target_node(node, ops.aten.expand):
+                continue
+            args = ExpandArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+            input, size = args.input, args.size
+            input_shape = extract_shape(input)
+            if list(input_shape) != size:
+                continue
+            node.replace_all_uses_with(input, propagate_meta=False)
+            modified = True
+            logger.debug(f"{node.name} is replaced with {input.name}")
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)

tico/passes/remove_redundant_permute.py ADDED Viewed

@@ -0,0 +1,122 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch.export import ExportedProgram
+from tico.passes import ops
+from tico.serialize.circle_mapping import extract_shape
+from tico.utils import logging
+from tico.utils.graph import create_node
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.utils import is_target_node
+from tico.utils.validate_args_kwargs import PermuteArgs
+def _compose_permutation(dims1: list[int], dims2: list[int]):
+    """
+    Compose two permutation vectors.
+    Given y = x.permute(dims1) and z = y.permute(dims2),
+    the overall permutation p = dims2 ∘ dims1 is
+        p[i] = dims1[dims2[i]]
+    """
+    assert len(dims1) == len(
+        dims2
+    ), f"len(dims1): {len(dims1)}, len(dims2): {len(dims2)}"
+    return [dims1[i] for i in dims2]
+def passes():
+    """
+    Return a list of passes that remove redundant `aten.permute` operators.
+    NOTE Both shape and stride of input/output should be same.
+    """
+    return [
+        RemoveRedundantPermutePattern1(),
+    ]
+@trace_graph_diff_on_pass
+class RemoveRedundantPermutePattern1(PassBase):
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        [BEFORE]
+            (AxBxC) - aten.permute_1 - aten.permute_2 - (OUT_SHAPE)
+        [AFTER]
+            if OUT_SHAPE == (AxBxC):
+                (AxBxC)
+            else:
+                (AxBxC) - aten.permute (fused dims) - (OUT_SHAPE)
+        """
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for permute2 in graph.nodes:
+            if not is_target_node(permute2, ops.aten.permute):
+                continue
+            if len(permute2.users) != 1:
+                continue
+            permute2_args = PermuteArgs(*permute2.args, **permute2.kwargs)  # type: ignore[arg-type]
+            permute1, permute2_dims = permute2_args.input, permute2_args.dims
+            if not is_target_node(permute1, ops.aten.permute):
+                continue
+            if len(permute1.users) != 1:
+                continue
+            permute1_args = PermuteArgs(*permute1.args, **permute1.kwargs)  # type: ignore[arg-type]
+            permute1_input, permute1_dims = permute1_args.input, permute1_args.dims
+            fused_dims = _compose_permutation(permute1_dims, permute2_dims)
+            identity = list(range(len(fused_dims)))
+            if fused_dims == identity:
+                # shape
+                permute1_input_shape = extract_shape(permute1_input)
+                permute2_shape = extract_shape(permute2)
+                assert permute1_input_shape == permute2_shape
+                permute2.replace_all_uses_with(permute1_input, propagate_meta=False)
+                logger.debug(f"{permute1.name} and {permute2.name} are removed.")
+            else:
+                with graph.inserting_after(permute2):
+                    new_args = (permute1_input, fused_dims)
+                    fused_permute = create_node(
+                        graph,
+                        torch.ops.aten.permute.default,
+                        args=new_args,
+                    )
+                    permute2.replace_all_uses_with(fused_permute, propagate_meta=True)
+                    logger.debug(f"{permute1.name} and {permute2.name} are fused.")
+            modified = True
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)

tico/passes/remove_redundant_reshape.py ADDED Viewed

@@ -0,0 +1,436 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch.export import ExportedProgram
+from tico.passes import ops
+from tico.serialize.circle_mapping import extract_shape
+from tico.utils import logging
+from tico.utils.graph import create_node
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.utils import broadcastable, is_target_node, set_new_meta_val
+from tico.utils.validate_args_kwargs import (
+    AddTensorArgs,
+    PermuteArgs,
+    ReshapeArgs,
+    SafeSoftmaxArgs,
+    SoftmaxArgs,
+)
+def passes():
+    """
+    Return list of passes that remove redundant `aten.reshape` operators.
+    """
+    return [
+        RemoveRedundantReshapePattern1(),
+        RemoveRedundantReshapePattern2(),
+        RemoveRedundantReshapePattern3(),
+        RemoveRedundantReshapePattern4(),
+        RemoveRedundantReshapePattern5(),
+    ]
+@trace_graph_diff_on_pass
+class RemoveRedundantReshapePattern1(PassBase):
+    mul_ops: List[torch._ops.OpOverload] = ops.aten.mul_scalar + ops.aten.mul_tensor
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        [BEFORE]
+            `(AxBxC) - aten.reshape` - (1xAxBxC) - `aten.permute` - (1xAxCxB) - `aten.mul` - (1xAxCxB) - `aten.reshape - (AxCxB)`
+        [AFTER]
+            `(AxBxC) - `aten.permute` - (AxCxB) - `aten.mul` - (AxCxB)`
+        """
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for reshape1 in graph.nodes:
+            ### first reshape
+            if not is_target_node(reshape1, ops.aten.reshape):
+                continue
+            # Assumes that other node do not use ops in the pattern for simplisity.
+            if len(reshape1.users) != 1:
+                continue
+            reshape1_args = ReshapeArgs(*reshape1.args, **reshape1.kwargs)  # type: ignore[arg-type]
+            reshape1_input = reshape1_args.input
+            # `(AxBxC) - aten.reshape` - (1xAxBxC)
+            if [1] + list(extract_shape(reshape1_input)) != list(
+                extract_shape(reshape1)
+            ):
+                continue
+            ### permute
+            permute = next(iter(reshape1.users))
+            if not is_target_node(permute, ops.aten.permute):
+                continue
+            if len(permute.users) != 1:
+                continue
+            permute_args = PermuteArgs(*permute.args, **permute.kwargs)  # type: ignore[arg-type]
+            permute_input, permute_dims = permute_args.input, permute_args.dims
+            # (1xAxBxC) - `aten.permute` - (1xAxCxB)
+            if permute_dims != [0, 1, 3, 2]:
+                continue
+            ### mul
+            mul = next(iter(permute.users))
+            if not is_target_node(mul, RemoveRedundantReshapePattern1.mul_ops):
+                continue
+            if len(mul.users) != 1:
+                continue
+            ### second reshape
+            reshape2 = next(iter(mul.users))
+            if not is_target_node(reshape2, ops.aten.reshape):
+                continue
+            if len(reshape2.users) != 1:
+                continue
+            reshape2_args = ReshapeArgs(*reshape2.args, **reshape2.kwargs)  # type: ignore[arg-type]
+            reshape2_input = reshape2_args.input
+            # (1xAxCxB) - `aten.reshape - (AxCxB)
+            if list(extract_shape(reshape2_input)) != [1] + list(
+                extract_shape(reshape2)
+            ):
+                continue
+            ### remove redundant reshapes
+            # update permute (remove reshape1)
+            permute.args = (reshape1_input, [0, 2, 1])
+            set_new_meta_val(permute)
+            set_new_meta_val(mul)
+            # remove reshape2
+            reshape2.replace_all_uses_with(mul, propagate_meta=False)
+            modified = True
+            logger.debug(f"{reshape1.name} and {reshape2.name} are removed.")
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)
+@trace_graph_diff_on_pass
+class RemoveRedundantReshapePattern2(PassBase):
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        [BEFORE]
+            `(AxBxC) - aten.reshape` - (1xAxBxC) - `aten.permute` - (Bx1xAxC) - `aten.reshape - (Bx(A*C))`
+        [AFTER]
+            `(AxBxC) - `aten.permute` - (BxAxC) - `aten.reshape` - (Bx(A*C))`
+        """
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for reshape1 in graph.nodes:
+            ### first reshape
+            if not is_target_node(reshape1, ops.aten.reshape):
+                continue
+            if len(reshape1.users) != 1:
+                continue
+            reshape1_args = ReshapeArgs(*reshape1.args, **reshape1.kwargs)  # type: ignore[arg-type]
+            reshape1_input = reshape1_args.input
+            # `(AxBxC) - aten.reshape` - (1xAxBxC)
+            if [1] + list(extract_shape(reshape1_input)) != list(
+                extract_shape(reshape1)
+            ):
+                continue
+            ### permute
+            permute = next(iter(reshape1.users))
+            if not is_target_node(permute, ops.aten.permute):
+                continue
+            if len(permute.users) != 1:
+                continue
+            permute_args = PermuteArgs(*permute.args, **permute.kwargs)  # type: ignore[arg-type]
+            permute_input, permute_dims = permute_args.input, permute_args.dims
+            # (1xAxBxC) - `aten.permute` - (Bx1xAxC)
+            if permute_dims != [2, 0, 1, 3]:
+                continue
+            ### second reshape
+            reshape2 = next(iter(permute.users))
+            if not is_target_node(reshape2, ops.aten.reshape):
+                continue
+            if len(reshape2.users) != 1:
+                continue
+            reshape2_args = ReshapeArgs(*reshape2.args, **reshape2.kwargs)  # type: ignore[arg-type]
+            reshape2_input, reshape2_size = reshape2_args.input, reshape2_args.shape
+            # (Bx1xAxC) - `aten.reshape - (Bx(A*C))
+            reshape2_input_shape = list(extract_shape(reshape2_input))
+            assert len(reshape2_input_shape) == 4
+            if list(extract_shape(reshape2)) != [
+                reshape2_input_shape[0],
+                (reshape2_input_shape[2] * reshape2_input_shape[3]),
+            ]:
+                continue
+            ### remove redundant reshapes
+            # update permute (remove reshape1)
+            permute.args = (reshape1_input, [1, 0, 2])
+            set_new_meta_val(permute)
+            reshape1.replace_all_uses_with(permute, propagate_meta=False)
+            # update reshape2 args
+            assert permute == reshape2_input
+            reshape2.args = (permute, reshape2_size)
+            modified = True
+            logger.debug(f"{reshape1.name} is removed.")
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)
+@trace_graph_diff_on_pass
+class RemoveRedundantReshapePattern3(PassBase):
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        [BEFORE]
+            (AxBxC) - aten.reshape - (1xAxBxC) - aten.add - (1xAxBxC) - aten.softmax - (1xAxBxC) - aten.reshape - (AxBxC)
+                      (reshape_2)                 (add)                 (softmax)                  (reshape_1)
+            (AxBxC) - aten.reshape - (1xAxBxC) /
+                      (reshape_3)
+        [AFTER]
+            (AxBxC) - aten.add - (AxBxC) - aten.softmax - (AxBxC)
+            (AxBxC) /   (add)                (softmax)
+        """
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for reshape_1 in graph.nodes:
+            # reshape_1
+            if not is_target_node(reshape_1, ops.aten.reshape):
+                continue
+            reshape_1_args = ReshapeArgs(*reshape_1.args, **reshape_1.kwargs)  # type: ignore[arg-type]
+            softmax = reshape_1_args.input
+            # softmax
+            softmax_args = None
+            if not is_target_node(softmax, ops.aten.softmax):
+                continue
+            if softmax.target == torch.ops.aten._softmax.default:
+                softmax_args = SoftmaxArgs(*softmax.args, **softmax.kwargs)  # type: ignore[arg-type, assignment]
+            elif softmax.target == torch.ops.aten._safe_softmax.default:
+                softmax_args = SafeSoftmaxArgs(*softmax.args, **softmax.kwargs)  # type: ignore[arg-type, assignment]
+            else:
+                raise RuntimeError("Invalid input")
+            assert softmax_args is not None
+            add, softmax_dim = (
+                softmax_args.input,
+                softmax_args.dim,
+            )
+            softmax_shape = extract_shape(softmax)
+            # TODO support other dimension
+            if softmax_dim != -1 and softmax_dim != len(softmax_shape) - 1:
+                continue
+            # add
+            if not add.target in ops.aten.add:
+                continue
+            add_args = AddTensorArgs(*add.args, **add.kwargs)  # type: ignore[arg-type]
+            reshape_2, reshape_3 = add_args.input, add_args.other
+            assert isinstance(reshape_2, torch.fx.Node), type(reshape_2)
+            assert isinstance(reshape_3, torch.fx.Node), type(reshape_3)
+            # reshape_2
+            if not reshape_2.op == "call_function":
+                continue
+            if not reshape_2.target in ops.aten.reshape:
+                continue
+            reshape_2_args = ReshapeArgs(*reshape_2.args, **reshape_2.kwargs)  # type: ignore[arg-type]
+            reshape_2_input = reshape_2_args.input
+            assert isinstance(reshape_2_input, torch.fx.Node), type(reshape_2_input)
+            # reshape_3
+            if not reshape_3.op == "call_function":
+                continue
+            if not reshape_3.target in ops.aten.reshape:
+                continue
+            reshape_3_args = ReshapeArgs(*reshape_3.args, **reshape_3.kwargs)  # type: ignore[arg-type]
+            reshape_3_input = reshape_3_args.input
+            assert isinstance(reshape_3_input, torch.fx.Node), type(reshape_3_input)
+            # Check condition
+            reshape_2_input_shape = extract_shape(reshape_2_input)
+            reshape_3_input_shape = extract_shape(reshape_3_input)
+            if not broadcastable(reshape_2_input_shape, reshape_3_input_shape):
+                continue
+            reshape_1_shape = extract_shape(reshape_1)
+            if (
+                reshape_2_input_shape != reshape_1_shape
+                and reshape_3_input_shape != reshape_1_shape
+            ):
+                continue
+            # Make sure the softmax axis length is unchanged.
+            if softmax_shape[-1] != reshape_1_shape[-1]:
+                continue
+            # Assume `aten.add` and `aten.softmax` have only one user.
+            if len(add.users) != 1:
+                continue
+            if len(softmax.users) != 1:
+                continue
+            # Update add
+            add.args = (reshape_2_input, reshape_3_input)
+            set_new_meta_val(add)
+            # Update softmax
+            if softmax_dim == len(softmax_shape) - 1:
+                softmax.update_arg(1, -1)  # (index, last_dim)
+            set_new_meta_val(softmax)
+            reshape_1.replace_all_uses_with(softmax, propagate_meta=False)
+            modified = True
+            logger.debug(
+                f"{reshape_2.name}, {reshape_3.name} and {reshape_1.name} are removed."
+            )
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)
+@trace_graph_diff_on_pass
+class RemoveRedundantReshapePattern4(PassBase):
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        NOTE: Below graph is just an example. This pattern matches not only for the 3D tensors.
+        What this pattern aims to remove is that the consecutive `aten.reshape` ops.
+        [BEFORE]
+            (AxBxC) - aten.reshape - (AxB'xC') - aten.reshape - (A'xB''xC')
+        [AFTER]
+            (AxBxC) - aten.reshape - (A'xB''xC')
+        """
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for reshape1 in graph.nodes:
+            # reshape_1
+            if not is_target_node(reshape1, ops.aten.reshape):
+                continue
+            reshape1_args = ReshapeArgs(*reshape1.args, **reshape1.kwargs)  # type: ignore[arg-type]
+            reshape1_input, size = reshape1_args.input, reshape1_args.shape
+            assert isinstance(reshape1_input, torch.fx.Node), type(reshape1_input)
+            assert isinstance(size, list), type(size)
+            for s in size:
+                assert isinstance(s, int), type(s)
+            if not len(reshape1.users) == 1:
+                continue
+            # reshape_2
+            reshape2 = next(iter(reshape1.users))
+            if not is_target_node(reshape2, ops.aten.reshape):
+                continue
+            reshape2_args = ReshapeArgs(*reshape2.args, **reshape2.kwargs)  # type: ignore[arg-type]
+            reshape2_input, reshape2_size = reshape2_args.input, reshape2_args.shape
+            assert isinstance(reshape2_input, torch.fx.Node), type(reshape2_input)
+            assert isinstance(reshape2_size, list), type(reshape2_size)
+            for s in reshape2_size:
+                assert isinstance(s, int), type(s)
+            with graph.inserting_before(reshape1):
+                fused_reshape = create_node(
+                    graph,
+                    reshape1.target,
+                    (reshape1_input, reshape2_size),
+                )
+            reshape2.replace_all_uses_with(fused_reshape, propagate_meta=True)
+            modified = True
+            logger.debug(
+                f"{reshape1.name} and {reshape2.name} are fused to {fused_reshape.name}"
+            )
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)
+@trace_graph_diff_on_pass
+class RemoveRedundantReshapePattern5(PassBase):
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        [BEFORE]
+            (AxBxC) - aten.reshape - (AxBxC)
+        [AFTER]
+            (AxBxC)
+        """
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for node in graph.nodes:
+            if not is_target_node(node, ops.aten.reshape):
+                continue
+            args = ReshapeArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+            output_shape = args.shape
+            input_shape = list(extract_shape(args.input))
+            if output_shape != input_shape:
+                continue
+            with graph.inserting_after(node):
+                node.replace_all_uses_with(args.input, propagate_meta=False)
+            modified = True
+            logger.debug(f"{node.name} is replaced with {args.input}")
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        return PassResult(modified)