tico 0.1.0.dev250714__py3-none-any.whl → 0.1.0.dev251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tico/__init__.py +9 -1
- tico/config/base.py +1 -1
- tico/config/v1.py +5 -0
- tico/passes/cast_aten_where_arg_type.py +1 -1
- tico/passes/cast_clamp_mixed_type_args.py +169 -0
- tico/passes/cast_mixed_type_args.py +4 -2
- tico/passes/const_prop_pass.py +1 -1
- tico/passes/convert_conv1d_to_conv2d.py +1 -1
- tico/passes/convert_expand_to_slice_cat.py +153 -0
- tico/passes/convert_matmul_to_linear.py +312 -0
- tico/passes/convert_to_relu6.py +1 -1
- tico/passes/decompose_addmm.py +0 -3
- tico/passes/decompose_batch_norm.py +2 -2
- tico/passes/decompose_fake_quantize.py +0 -3
- tico/passes/decompose_fake_quantize_tensor_qparams.py +5 -6
- tico/passes/decompose_group_norm.py +0 -3
- tico/passes/legalize_predefined_layout_operators.py +2 -11
- tico/passes/lower_to_resize_nearest_neighbor.py +1 -1
- tico/passes/lower_to_slice.py +1 -1
- tico/passes/merge_consecutive_cat.py +1 -1
- tico/passes/ops.py +1 -1
- tico/passes/remove_redundant_assert_nodes.py +3 -1
- tico/passes/remove_redundant_expand.py +3 -6
- tico/passes/remove_redundant_reshape.py +5 -5
- tico/passes/segment_index_select.py +1 -1
- tico/quantization/__init__.py +6 -0
- tico/{experimental/quantization → quantization}/algorithm/gptq/gptq.py +1 -1
- tico/quantization/algorithm/gptq/quantizer.py +292 -0
- tico/{experimental/quantization → quantization}/algorithm/gptq/utils.py +1 -1
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/annotator.py +7 -14
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/adaptive_avg_pool2d.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/add.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/conv2d.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/div.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/linear.py +5 -7
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/mean.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/mul.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/relu6.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/rsqrt.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/sub.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/spec.py +1 -3
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/utils.py +1 -1
- tico/{experimental/quantization → quantization}/algorithm/pt2e/quantizer.py +5 -2
- tico/{experimental/quantization → quantization}/algorithm/pt2e/utils.py +1 -4
- tico/{experimental/quantization → quantization}/algorithm/smoothquant/observer.py +26 -8
- tico/{experimental/quantization → quantization}/algorithm/smoothquant/quantizer.py +28 -9
- tico/quantization/algorithm/smoothquant/smooth_quant.py +327 -0
- tico/quantization/config/base.py +26 -0
- tico/quantization/config/gptq.py +29 -0
- tico/quantization/config/pt2e.py +25 -0
- tico/quantization/config/ptq.py +119 -0
- tico/{experimental/quantization/config.py → quantization/config/smoothquant.py} +9 -36
- tico/{experimental/quantization → quantization}/evaluation/evaluate.py +8 -17
- tico/{experimental/quantization → quantization}/evaluation/executor/circle_executor.py +3 -4
- tico/{experimental/quantization → quantization}/evaluation/executor/triv24_executor.py +2 -4
- tico/quantization/evaluation/metric.py +146 -0
- tico/{experimental/quantization → quantization}/evaluation/utils.py +1 -1
- tico/quantization/passes/__init__.py +1 -0
- tico/{experimental/quantization → quantization}/passes/fold_quant_ops.py +0 -1
- tico/quantization/passes/insert_quantize_on_dtype_mismatch.py +459 -0
- tico/{experimental/quantization → quantization}/passes/quantize_bias.py +0 -1
- tico/{experimental/quantization → quantization}/passes/remove_weight_dequant_op.py +1 -1
- tico/{experimental/quantization → quantization}/public_interface.py +19 -18
- tico/{experimental/quantization → quantization}/quantizer.py +1 -1
- tico/quantization/quantizer_registry.py +73 -0
- tico/quantization/wrapq/__init__.py +1 -0
- tico/quantization/wrapq/dtypes.py +70 -0
- tico/quantization/wrapq/examples/__init__.py +1 -0
- tico/quantization/wrapq/examples/compare_ppl.py +230 -0
- tico/quantization/wrapq/examples/debug_quant_outputs.py +224 -0
- tico/quantization/wrapq/examples/quantize_linear.py +107 -0
- tico/quantization/wrapq/examples/quantize_llama_attn.py +101 -0
- tico/quantization/wrapq/examples/quantize_llama_decoder_layer.py +125 -0
- tico/quantization/wrapq/examples/quantize_llama_mlp.py +95 -0
- tico/quantization/wrapq/examples/quantize_with_gptq.py +265 -0
- tico/quantization/wrapq/mode.py +32 -0
- tico/quantization/wrapq/observers/__init__.py +1 -0
- tico/quantization/wrapq/observers/affine_base.py +128 -0
- tico/quantization/wrapq/observers/base.py +98 -0
- tico/quantization/wrapq/observers/ema.py +62 -0
- tico/quantization/wrapq/observers/identity.py +74 -0
- tico/quantization/wrapq/observers/minmax.py +39 -0
- tico/quantization/wrapq/observers/mx.py +60 -0
- tico/quantization/wrapq/qscheme.py +40 -0
- tico/quantization/wrapq/quantizer.py +179 -0
- tico/quantization/wrapq/utils/__init__.py +1 -0
- tico/quantization/wrapq/utils/introspection.py +167 -0
- tico/quantization/wrapq/utils/metrics.py +124 -0
- tico/quantization/wrapq/utils/reduce_utils.py +25 -0
- tico/quantization/wrapq/wrappers/__init__.py +1 -0
- tico/quantization/wrapq/wrappers/fairseq/__init__.py +5 -0
- tico/quantization/wrapq/wrappers/fairseq/decoder_export_single_step.py +234 -0
- tico/quantization/wrapq/wrappers/fairseq/quant_decoder.py +429 -0
- tico/quantization/wrapq/wrappers/fairseq/quant_decoder_layer.py +492 -0
- tico/quantization/wrapq/wrappers/fairseq/quant_encoder.py +331 -0
- tico/quantization/wrapq/wrappers/fairseq/quant_encoder_layer.py +163 -0
- tico/quantization/wrapq/wrappers/fairseq/quant_mha.py +381 -0
- tico/quantization/wrapq/wrappers/llama/__init__.py +1 -0
- tico/quantization/wrapq/wrappers/llama/quant_attn.py +276 -0
- tico/quantization/wrapq/wrappers/llama/quant_decoder_layer.py +176 -0
- tico/quantization/wrapq/wrappers/llama/quant_mlp.py +96 -0
- tico/quantization/wrapq/wrappers/nn/__init__.py +1 -0
- tico/quantization/wrapq/wrappers/nn/quant_layernorm.py +183 -0
- tico/quantization/wrapq/wrappers/nn/quant_linear.py +65 -0
- tico/quantization/wrapq/wrappers/nn/quant_silu.py +59 -0
- tico/quantization/wrapq/wrappers/ptq_wrapper.py +69 -0
- tico/quantization/wrapq/wrappers/quant_elementwise.py +111 -0
- tico/quantization/wrapq/wrappers/quant_module_base.py +168 -0
- tico/quantization/wrapq/wrappers/registry.py +125 -0
- tico/serialize/circle_graph.py +12 -4
- tico/serialize/circle_mapping.py +76 -2
- tico/serialize/circle_serializer.py +253 -148
- tico/serialize/operators/adapters/__init__.py +1 -0
- tico/serialize/operators/adapters/llama_rmsnorm.py +35 -0
- tico/serialize/operators/op_any.py +7 -14
- tico/serialize/operators/op_avg_pool2d.py +11 -4
- tico/serialize/operators/op_clamp.py +5 -7
- tico/serialize/operators/op_constant_pad_nd.py +41 -11
- tico/serialize/operators/op_conv2d.py +14 -6
- tico/serialize/operators/op_copy.py +26 -3
- tico/serialize/operators/op_cumsum.py +3 -1
- tico/serialize/operators/op_depthwise_conv2d.py +17 -7
- tico/serialize/operators/op_full_like.py +0 -2
- tico/serialize/operators/op_index_select.py +8 -1
- tico/serialize/operators/op_instance_norm.py +0 -6
- tico/serialize/operators/op_le.py +54 -0
- tico/serialize/operators/op_log1p.py +3 -2
- tico/serialize/operators/op_max_pool2d_with_indices.py +17 -7
- tico/serialize/operators/op_mm.py +15 -131
- tico/serialize/operators/op_mul.py +2 -8
- tico/serialize/operators/op_pow.py +3 -1
- tico/serialize/operators/op_repeat.py +12 -3
- tico/serialize/operators/op_reshape.py +1 -1
- tico/serialize/operators/op_rmsnorm.py +65 -0
- tico/serialize/operators/op_softmax.py +7 -14
- tico/serialize/operators/op_split_with_sizes.py +16 -8
- tico/serialize/operators/op_transpose_conv.py +11 -8
- tico/serialize/operators/op_view.py +2 -1
- tico/serialize/quant_param.py +5 -5
- tico/utils/convert.py +30 -17
- tico/utils/dtype.py +42 -0
- tico/utils/graph.py +1 -1
- tico/utils/model.py +2 -1
- tico/utils/padding.py +2 -2
- tico/utils/pytree_utils.py +134 -0
- tico/utils/record_input.py +102 -0
- tico/utils/register_custom_op.py +29 -4
- tico/utils/serialize.py +16 -3
- tico/utils/signature.py +247 -0
- tico/utils/torch_compat.py +52 -0
- tico/utils/utils.py +50 -58
- tico/utils/validate_args_kwargs.py +38 -3
- {tico-0.1.0.dev250714.dist-info → tico-0.1.0.dev251102.dist-info}/METADATA +49 -2
- tico-0.1.0.dev251102.dist-info/RECORD +271 -0
- tico/experimental/quantization/__init__.py +0 -1
- tico/experimental/quantization/algorithm/gptq/quantizer.py +0 -225
- tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py +0 -164
- tico/experimental/quantization/evaluation/metric.py +0 -109
- tico/experimental/quantization/passes/insert_quantize_on_dtype_mismatch.py +0 -437
- tico-0.1.0.dev250714.dist-info/RECORD +0 -209
- /tico/{experimental/quantization → quantization}/algorithm/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/gptq/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/gptq/quant.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/config.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/transformation/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/transformation/convert_scalars_to_attrs.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/smoothquant/__init__.py +0 -0
- /tico/{experimental/quantization/evaluation → quantization/config}/__init__.py +0 -0
- /tico/{experimental/quantization/evaluation/executor → quantization/evaluation}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/evaluation/backend.py +0 -0
- /tico/{experimental/quantization/passes → quantization/evaluation/executor}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/evaluation/executor/backend_executor.py +0 -0
- /tico/{experimental/quantization → quantization}/passes/propagate_qparam_backward.py +0 -0
- /tico/{experimental/quantization → quantization}/passes/propagate_qparam_forward.py +0 -0
- {tico-0.1.0.dev250714.dist-info → tico-0.1.0.dev251102.dist-info}/LICENSE +0 -0
- {tico-0.1.0.dev250714.dist-info → tico-0.1.0.dev251102.dist-info}/WHEEL +0 -0
- {tico-0.1.0.dev250714.dist-info → tico-0.1.0.dev251102.dist-info}/entry_points.txt +0 -0
- {tico-0.1.0.dev250714.dist-info → tico-0.1.0.dev251102.dist-info}/top_level.txt +0 -0
tico/passes/convert_matmul_to_linear.py ADDED
@@ -0,0 +1,312 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
+from torch.export import ExportedProgram
+
+from tico.serialize.circle_mapping import extract_shape
+
+from tico.utils import logging
+from tico.utils.graph import create_node
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.validate_args_kwargs import BmmArgs, MatmulArgs
+
+
+class Converter:  # type: ignore[empty-body]
+    def __init__(self):
+        super().__init__()
+
+    def match(self, exported_program, node) -> bool:  # type: ignore[empty-body]
+        return False
+
+    def convert(self, exported_program, node) -> torch.fx.Node:  # type: ignore[empty-body]
+        pass
+
+
+class MatmulToLinearConverter(Converter):
+    def __init__(self):
+        super().__init__()
+
+    def convert(self, exported_program, node) -> torch.fx.Node:
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+
+        mm_args = MatmulArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+
+        lhs = mm_args.input
+        rhs = mm_args.other
+
+        with graph.inserting_before(node):
+            transpose_node = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(rhs, [1, 0]),
+            )
+            linear_node = create_node(
+                graph,
+                torch.ops.aten.linear.default,
+                args=(lhs, transpose_node),
+            )
+            node.replace_all_uses_with(linear_node, propagate_meta=True)
+
+        return linear_node
+
+
+class RhsConstMatmulToLinearConverter(MatmulToLinearConverter):
+    def __init__(self):
+        super().__init__()
+
+    def match(self, exported_program, node) -> bool:
+        if not node.target == torch.ops.aten.mm.default:
+            return False
+
+        mm_args = MatmulArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+
+        rhs = mm_args.other
+        if isinstance(rhs, torch.fx.Node):
+            if is_lifted_tensor_constant(exported_program, rhs):
+                return True
+            elif is_param(exported_program, rhs):
+                return True
+            elif is_buffer(exported_program, rhs):
+                return True
+            else:
+                return False
+        return False
+
+    def convert(self, exported_program, node) -> torch.fx.Node:
+        return super().convert(exported_program, node)
+
+
+class LhsConstMatmulToLinearConverter(MatmulToLinearConverter):
+    def __init__(self):
+        super().__init__()
+
+    def match(self, exported_program, node) -> bool:
+        if not node.target == torch.ops.aten.mm.default:
+            return False
+
+        mm_args = MatmulArgs(*node.args, **node.kwargs)
+        lhs = mm_args.input
+        if isinstance(lhs, torch.fx.Node):
+            if is_lifted_tensor_constant(exported_program, lhs):
+                return True
+            elif is_param(exported_program, lhs):
+                return True
+            elif is_buffer(exported_program, lhs):
+                return True
+        return False
+
+    def convert(self, exported_program, node) -> torch.fx.Node:
+        return super().convert(exported_program, node)
+
+
+class SingleBatchLhsConstBmmToLinearConverter(Converter):
+    """
+    Convert `single-batched & lhs-const BatchMatMul` to `linear` operation.
+
+    [1] exchange lhs and rhs
+    [2] transpose rhs
+    [3] transpose output
+
+    **Before**
+
+    lhs[1,a,b](const)   rhs[1,b,c]
+         |                  |
+         |                  |
+         ---------bmm---------
+                   |
+             output[1,a,c]
+
+
+    **After**
+
+    rhs[1,b,c]
+        |
+        tr        lhs'[a,b](const-folded)
+        |[1,c,b]      |
+        |             |
+        ---------fc--------
+                 |[1,c,a]
+                 tr
+                 |
+           output[1,a,c]
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def match(self, exported_program, node) -> bool:
+        if not node.target == torch.ops.aten.bmm.default:
+            return False
+
+        bmm_args = BmmArgs(*node.args, **node.kwargs)
+        lhs = bmm_args.input
+        rhs = bmm_args.mat2
+
+        # [1] Single-batch
+        lhs_shape = extract_shape(lhs)
+        rhs_shape = extract_shape(rhs)
+
+        assert len(lhs_shape) == len(
+            rhs_shape
+        ), f"Bmm input's ranks must be the same but got {lhs_shape} and {rhs_shape}"
+
+        if not (lhs_shape[0] == rhs_shape[0] == 1):
+            return False
+
+        # [2] Lhs is constant
+        if not isinstance(lhs, torch.fx.Node):
+            return False
+        if not (
+            is_lifted_tensor_constant(exported_program, lhs)
+            or is_param(exported_program, lhs)
+            or is_buffer(exported_program, lhs)
+        ):
+            return False
+
+        return True
+
+    def convert(self, exported_program, node) -> torch.fx.Node:
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+
+        bmm_args = BmmArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
+
+        lhs = bmm_args.input  # const
+        rhs = bmm_args.mat2  # non-const
+        lhs_shape = extract_shape(lhs)
+        rhs_shape = extract_shape(rhs)
+        assert rhs_shape[0] == 1
+        assert lhs_shape[0] == 1
+
+        with graph.inserting_before(node):
+            rhs_tr = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(rhs, [0, 2, 1]),
+            )
+            lhs_reshape = create_node(
+                graph,
+                torch.ops.aten.view.default,
+                args=(lhs, list(lhs_shape[1:])),
+            )
+
+            linear_node = create_node(
+                graph,
+                torch.ops.aten.linear.default,
+                args=(rhs_tr, lhs_reshape),
+            )
+
+            tr_linear_node = create_node(
+                graph,
+                torch.ops.aten.permute.default,
+                args=(linear_node, [0, 2, 1]),
+            )
+
+        node.replace_all_uses_with(tr_linear_node, propagate_meta=False)
+
+        return tr_linear_node
+
+
+@trace_graph_diff_on_pass
+class ConvertMatmulToLinear(PassBase):
+    """
+    This pass converts matmul(partially includes single-batch bmm) to linear selectively
+
+    How to select between `matmul` and `linear`?
+
+    * Linear has better quantization accuracy (NPU backend)
+      Due to ONE compiler's quantization policy;
+      FullyConnected(=Linear) uses per-channel quantization for weight and per-tensor for input.
+      BatchMatmul(=matmul) uses per-tensor quantization for both rhs and lhs.
+
+    * Matmul to Linear requires Transpose, which may harm latency
+      When RHS is constant, addtional transpose can be folded.
+
+    [RHS non-const case]
+    Constant folding cannot be performed.
+
+    lhs   rhs (non-const)
+     |       |
+     |   transpose
+     |       |
+     -- linear --
+          |
+         out
+
+    [RHS const case]
+    Constant folding can be performed to
+
+    lhs   rhs (const)           lh    rhs (folded const)
+     |       |                   |       |
+     |   transpose               |       |
+     |       |                   |       |
+     -- linear --      -->       -- linear --
+          |                           |
+         out                         out
+
+
+    enable_lhs_const: If true, convert matmul where LHS is constant tensor. Default is False.
+    enable_rhs_const: If true, convert matmul where RHS is constant tensor. Default is True.
+    """
+
+    def __init__(
+        self,
+        enable_lhs_const: Optional[bool] = False,
+        enable_rhs_const: Optional[bool] = True,
+        enable_single_batch_lhs_const_bmm: Optional[bool] = False,
+    ):
+        super().__init__()
+        self.converters: List[Converter] = []
+        if enable_lhs_const:
+            self.converters.append(LhsConstMatmulToLinearConverter())
+        if enable_rhs_const:
+            self.converters.append(RhsConstMatmulToLinearConverter())
+        if enable_single_batch_lhs_const_bmm:
+            self.converters.append(SingleBatchLhsConstBmmToLinearConverter())
+
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        logger = logging.getLogger(__name__)
+
+        graph_module = exported_program.graph_module
+        graph = graph_module.graph
+        modified = False
+        for node in graph.nodes:
+            if not node.op == "call_function":
+                continue
+
+            for converter in self.converters:
+                if not converter.match(exported_program, node):
+                    continue
+
+                new_node = converter.convert(exported_program, node)
+                modified = True
+                logger.debug(
+                    f"{node.name} is replaced with {new_node.name} operator (permute + linear)"
+                )
+                continue
+
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+
+        return PassResult(modified)
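The equivalences the two converter families rely on are easy to check numerically with plain PyTorch. This is a standalone sketch, not part of the diff; shapes and names are illustrative:

    import torch
    import torch.nn.functional as F

    # mm -> permute + linear: F.linear computes x @ W.T, so x @ w == F.linear(x, w.t())
    x, w = torch.randn(5, 3), torch.randn(3, 4)
    assert torch.allclose(x @ w, F.linear(x, w.t()), atol=1e-6)

    # single-batch, lhs-const bmm -> fc, following the docstring diagram:
    # bmm(lhs[1,a,b], rhs[1,b,c]) == permute(linear(permute(rhs), lhs[0]), [0,2,1])
    a, b, c = 2, 3, 4
    lhs, rhs = torch.randn(1, a, b), torch.randn(1, b, c)
    out = F.linear(rhs.permute(0, 2, 1), lhs.reshape(a, b)).permute(0, 2, 1)
    assert torch.allclose(torch.bmm(lhs, rhs), out, atol=1e-6)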
tico/passes/convert_to_relu6.py CHANGED
tico/passes/decompose_addmm.py CHANGED
@@ -20,7 +20,6 @@ import torch
 from torch.export import ExportedProgram
 
 from tico.serialize.circle_mapping import extract_shape
-from tico.utils import logging
 from tico.utils.graph import add_placeholder, create_node
 from tico.utils.passes import PassBase, PassResult
 from tico.utils.trace_decorators import trace_graph_diff_on_pass
@@ -59,8 +58,6 @@ class DecomposeAddmm(PassBase):
         super().__init__()
 
     def call(self, exported_program: ExportedProgram) -> PassResult:
-        logger = logging.getLogger(__name__)
-
         gm = exported_program.graph_module
         graph: torch.fx.Graph = gm.graph
         modified = False
tico/passes/decompose_batch_norm.py CHANGED
@@ -96,9 +96,9 @@ class DecomposeBatchNorm(PassBase):
         eps = args.eps
 
         if not running_mean:
-            raise NotYetSupportedError(
+            raise NotYetSupportedError("running_mean=None is not supported yet")
         if not running_var:
-            raise NotYetSupportedError(
+            raise NotYetSupportedError("running_var=None is not supported yet")
 
         """
         Only support the cases generated from torch.nn.BatchNorm2d module,
tico/passes/decompose_fake_quantize.py CHANGED
@@ -19,10 +19,8 @@ if TYPE_CHECKING:
     import torch
 
 # To import torch.ops.quantized_decomposed related operator
-from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib
 from torch.export import ExportedProgram
 
-from tico.utils import logging
 from tico.utils.graph import create_node
 from tico.utils.passes import PassBase, PassResult
 from tico.utils.trace_decorators import trace_graph_diff_on_pass
@@ -66,7 +64,6 @@ class DecomposeFakeQuantize(PassBase):
         super().__init__()
 
     def call(self, exported_program: ExportedProgram) -> PassResult:
-        logger = logging.getLogger(__name__)
        modified = False
 
        gm = exported_program.graph_module
tico/passes/decompose_fake_quantize_tensor_qparams.py CHANGED
@@ -26,10 +26,8 @@ from torch._export.utils import (
 )
 
 # To import torch.ops.quantized_decomposed related operator
-from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib
 from torch.export import ExportedProgram
 
-from tico.utils import logging
 from tico.utils.graph import create_node
 from tico.utils.passes import PassBase, PassResult
 from tico.utils.trace_decorators import (
@@ -246,10 +244,11 @@ class DecomposeFakeQuantizeTensorQParams(PassBase):
             # So, let's remove `mask` from the output.args first.
             # mask_user(output).args == (dequantize_per_tensor.tensor, mask)
             if mask:
-                len(mask) == 1
-
-
-
+                assert len(mask) == 1
+                if len(mask[0].users) > 0:
+                    mask_user = list(mask[0].users.keys())[0]
+                    assert len(mask_user.args) == 1
+                    mask_user.args = ((mask_user.args[0][0],),)
                 modified = True
             if (
                 node.target
tico/passes/decompose_group_norm.py CHANGED
@@ -22,7 +22,6 @@ import torch
 from torch.export import ExportedProgram
 
 from tico.serialize.circle_mapping import extract_shape
-from tico.utils import logging
 from tico.utils.graph import create_node
 from tico.utils.passes import PassBase, PassResult
 from tico.utils.trace_decorators import trace_graph_diff_on_pass
@@ -126,8 +125,6 @@ class DecomposeGroupNorm(PassBase):
         )
 
     def call(self, exported_program: ExportedProgram) -> PassResult:
-        logger = logging.getLogger(__name__)
-
         gm = exported_program.graph_module
         graph: torch.fx.Graph = gm.graph
         modified = False
tico/passes/legalize_predefined_layout_operators.py CHANGED
@@ -20,7 +20,7 @@ if TYPE_CHECKING:
     import torch
 from torch.export import ExportedProgram
 
-from tico.serialize.
+from tico.serialize.circle_mapping import extract_shape
 from tico.utils import logging
 from tico.utils.errors import NotYetSupportedError
 from tico.utils.graph import create_node
@@ -206,7 +206,6 @@ class LegalizePreDefinedLayoutOperators(PassBase):
 
         args = ConvTranspose2DArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
         input = args.input
-        padding = args.padding
         groups = args.groups
         dilation = args.dilation
 
@@ -288,13 +287,12 @@ class LegalizePreDefinedLayoutOperators(PassBase):
         input = args.input
         weight = args.weight
         bias = args.bias
-        eps = args.eps
 
         running_mean = args.running_mean
         running_var = args.running_var
         use_input_stats = args.use_input_stats
 
-        if not
+        if not use_input_stats:
             raise NotYetSupportedError("Only support use_input_stats is True.")
         if not isinstance(running_mean, NoneType):
             raise NotYetSupportedError("Only support running_mean=None")
@@ -350,10 +348,6 @@ class LegalizePreDefinedLayoutOperators(PassBase):
         # max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
         args = MaxPool2dWithIndicesArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
         input_ = args.input
-        kernel_size = args.kernel_size
-        stride = args.stride
-        padding = args.padding
-        dilation = args.dilation
         ceil_mode = args.ceil_mode
         if ceil_mode:
             raise NotYetSupportedError("Only support non-ceil model.")
@@ -402,9 +396,6 @@ class LegalizePreDefinedLayoutOperators(PassBase):
         # avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> (Tensor)
         args = AvgPool2dArgs(*node.args, **node.kwargs)  # type: ignore[arg-type]
         input_ = args.input
-        kernel_size = args.kernel_size
-        stride = args.stride
-        padding = args.padding
         ceil_mode = args.ceil_mode
         if ceil_mode:
             raise NotYetSupportedError("Only support non-ceil model.")
tico/passes/lower_to_resize_nearest_neighbor.py CHANGED
@@ -67,7 +67,7 @@ class LowerToResizeNearestNeighbor(PassBase):
             return None
         # indices = [None, None, H index, W index]
         N, C, H, W = indices
-        if N
+        if N is not None or C is not None:
             return None
         if not isinstance(H, torch.fx.Node):
             return None
tico/passes/lower_to_slice.py CHANGED
@@ -28,7 +28,7 @@ from torch._export.utils import (
 from torch.export import ExportedProgram
 
 from tico.passes import ops
-from tico.serialize.
+from tico.serialize.circle_mapping import extract_shape
 from tico.utils import logging
 from tico.utils.graph import create_node, is_single_value_tensor
 from tico.utils.passes import PassBase, PassResult
tico/passes/merge_consecutive_cat.py CHANGED
@@ -51,7 +51,7 @@ class MergeConsecutiveCat(PassBase):
             if not prev_cat.op == "call_function":
                 continue
 
-            if
+            if prev_cat.target not in ops.aten.cat:
                 continue
 
             prev_args = CatArgs(*prev_cat.args, **prev_cat.kwargs)  # type: ignore[arg-type]
tico/passes/ops.py CHANGED
@@ -69,10 +69,10 @@ class AtenOps:
             torch.ops.aten.unsqueeze_copy.default,
         ]
         self.view = [
-            torch.ops.aten.view,
             torch.ops.aten.view.default,
             torch.ops.aten.view_copy.default,
         ]
+        self._to_copy = [torch.ops.aten._to_copy.default]
 
 
 aten = AtenOps()
tico/passes/remove_redundant_assert_nodes.py CHANGED
@@ -21,7 +21,9 @@ from tico.utils.utils import is_target_node
 
 
 assert_node_targets = [
+    torch.ops.aten._assert_scalar.default,
     torch.ops.aten._assert_tensor_metadata.default,
+    torch.ops.aten.sym_constrain_range_for_size.default,  # Related to symbolic shape validation
 ]
 
 
@@ -29,7 +31,7 @@ assert_node_targets = [
 class RemoveRedundantAssertionNodes(PassBase):
     """
     This removes redundant assertion nodes.
-
+    When assertion node is erased, related comparison nodes are also removed by graph.eliminate_dead_code().
     """
 
     def __init__(self):
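The docstring addition above is straightforward to reproduce with vanilla torch.fx: once an assertion node is erased, the comparison feeding it has no remaining users and `eliminate_dead_code()` sweeps it. A standalone sketch (the function `f` is illustrative, not from the package):

    import torch
    import torch.fx

    def f(x):
        cmp = x.sum() > 0                            # comparison node
        torch._assert(cmp, "sum must be positive")   # assertion node, no users
        return x * 2

    gm = torch.fx.symbolic_trace(f)
    for node in [n for n in gm.graph.nodes if n.target is torch._assert]:
        gm.graph.erase_node(node)      # drop the assertion itself
    gm.graph.eliminate_dead_code()     # also removes the now-dead `sum` and `>` nodes
    gm.recompile()
    print(gm.code)                     # only the `x * 2` computation remains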
tico/passes/remove_redundant_expand.py CHANGED
@@ -12,11 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    import torch.fx
-import torch
 from torch.export import ExportedProgram
 
 from tico.passes import ops
@@ -51,7 +46,9 @@ class RemoveRedundantExpand(PassBase):
             input, size = args.input, args.size
 
             input_shape = extract_shape(input)
-
+            output_shape = extract_shape(node)
+
+            if input_shape != output_shape:
                 continue
 
             node.replace_all_uses_with(input, propagate_meta=False)
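For context on the tightened check: an `expand` whose output shape equals its input shape is an identity, which is what the pass now verifies before replacing the node with its input. A minimal illustration (not from the package):

    import torch

    x = torch.randn(4, 1, 8)
    y = x.expand(4, 1, 8)   # output shape == input shape -> the expand is a no-op
    assert y.shape == x.shape and torch.equal(y, x)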
tico/passes/remove_redundant_reshape.py CHANGED
@@ -90,7 +90,7 @@ class RemoveRedundantReshapePattern1(PassBase):
             if len(permute.users) != 1:
                 continue
             permute_args = PermuteArgs(*permute.args, **permute.kwargs)  # type: ignore[arg-type]
-
+            permute_dims = permute_args.dims
             # (1xAxBxC) - `aten.permute` - (1xAxCxB)
             if permute_dims != [0, 1, 3, 2]:
                 continue
@@ -172,7 +172,7 @@ class RemoveRedundantReshapePattern2(PassBase):
             if len(permute.users) != 1:
                 continue
             permute_args = PermuteArgs(*permute.args, **permute.kwargs)  # type: ignore[arg-type]
-
+            permute_dims = permute_args.dims
             # (1xAxBxC) - `aten.permute` - (Bx1xAxC)
             if permute_dims != [2, 0, 1, 3]:
                 continue
@@ -262,7 +262,7 @@ class RemoveRedundantReshapePattern3(PassBase):
                 continue
 
             # add
-            if
+            if add.target not in ops.aten.add:
                 continue
             add_args = AddTensorArgs(*add.args, **add.kwargs)  # type: ignore[arg-type]
             reshape_2, reshape_3 = add_args.input, add_args.other
@@ -272,7 +272,7 @@ class RemoveRedundantReshapePattern3(PassBase):
             # reshape_2
             if not reshape_2.op == "call_function":
                 continue
-            if
+            if reshape_2.target not in ops.aten.reshape:
                 continue
             reshape_2_args = ReshapeArgs(*reshape_2.args, **reshape_2.kwargs)  # type: ignore[arg-type]
             reshape_2_input = reshape_2_args.input
@@ -280,7 +280,7 @@ class RemoveRedundantReshapePattern3(PassBase):
             # reshape_3
             if not reshape_3.op == "call_function":
                 continue
-            if
+            if reshape_3.target not in ops.aten.reshape:
                 continue
             reshape_3_args = ReshapeArgs(*reshape_3.args, **reshape_3.kwargs)  # type: ignore[arg-type]
             reshape_3_input = reshape_3_args.input
tico/passes/segment_index_select.py CHANGED
@@ -29,7 +29,7 @@ from torch._export.utils import (
 from torch.export import ExportedProgram
 
 from tico.passes import ops
-from tico.serialize.
+from tico.serialize.circle_mapping import extract_shape
 from tico.utils import logging
 from tico.utils.graph import add_placeholder, create_node, is_single_value_tensor
 from tico.utils.passes import PassBase, PassResult
tico/{experimental/quantization → quantization}/algorithm/gptq/gptq.py RENAMED
@@ -25,7 +25,7 @@ from typing import Optional
 import torch
 import torch.nn as nn
 
-from tico.
+from tico.quantization.algorithm.gptq.quant import quantize, Quantizer
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False