PyPI - tico - Versions diffs - 0.2.0.dev260511__tar.gz → 0.2.0.dev260512__tar.gz - Mend

tico 0.2.0.dev260511__tar.gz → 0.2.0.dev260512__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (345) hide show

{tico-0.2.0.dev260511 → tico-0.2.0.dev260512}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tico
-Version: 0.2.0.dev260511
+Version: 0.2.0.dev260512
 Summary: Convert Exported Torch Module To Circle
 License: This file provides full text of licenses used in this project

tico-0.2.0.dev260512/tico/_version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.2.0.dev260512"

tico-0.2.0.dev260512/tico/passes/remove_unused_placeholder.py ADDED Viewed

@@ -0,0 +1,130 @@
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
+from torch.export import ExportedProgram
+from tico.utils import logging
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import (
+    trace_const_diff_on_pass,
+    trace_graph_diff_on_pass,
+)
+def _is_constant_placeholder(
+    exported_program: ExportedProgram,
+    node: "torch.fx.Node",
+) -> bool:
+    """
+    Tell whether `node` is a placeholder backed by ExportedProgram state.
+    A placeholder counts as constant when it is a parameter, a buffer, or a
+    lifted tensor constant of `exported_program`. Any node whose op is not
+    "placeholder" yields False without further checks.
+    """
+    if node.op == "placeholder":
+        # Checks run in the same order as before: parameter, buffer, constant.
+        if is_param(exported_program, node):
+            return True
+        if is_buffer(exported_program, node):
+            return True
+        return is_lifted_tensor_constant(exported_program, node)
+    return False
+def _remove_constant_placeholder(
+    exported_program: ExportedProgram,
+    node: "torch.fx.Node",
+) -> None:
+    """
+    Drop an unused constant placeholder together with the state that backs it.
+    The entry mapped to the placeholder is popped from `state_dict` and/or
+    `constants`, then the node itself is erased from the graph. The graph
+    signature is not touched here; the caller rebuilds it once every unused
+    placeholder has been removed.
+    """
+    sig = exported_program.graph_signature
+    placeholder_name = node.name
+    param_target = sig.inputs_to_parameters.get(placeholder_name, None)
+    if param_target:
+        exported_program.state_dict.pop(param_target, None)
+    else:
+        constant_target = sig.inputs_to_lifted_tensor_constants.get(
+            placeholder_name, None
+        )
+        if constant_target:
+            exported_program.constants.pop(constant_target, None)
+        else:
+            buffer_target = sig.inputs_to_buffers.get(placeholder_name, None)
+            if buffer_target:
+                # Buffers are cleared from both containers; a missing key is
+                # tolerated by the `None` default of `pop`.
+                exported_program.constants.pop(buffer_target, None)
+                exported_program.state_dict.pop(buffer_target, None)
+    exported_program.graph.erase_node(node)
+@trace_graph_diff_on_pass
+@trace_const_diff_on_pass
+class RemoveUnusedPlaceholder(PassBase):
+    """
+    Strip constant placeholders that have no users from an exported graph.
+    FX dead-code elimination does not remove placeholder nodes even when they
+    have no users, so this pass erases the ones that correspond to parameters,
+    buffers, or lifted tensor constants and then rebuilds the input specs of
+    the ExportedProgram graph signature from the placeholders that remain.
+    Placeholders that are not constant (runtime user inputs) are never removed.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        Remove every unused constant placeholder of `exported_program`.
+        Always returns `PassResult(False)`, also when nodes were removed, so
+        the pass is not scheduled again.
+        """
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph: torch.fx.Graph = graph_module.graph
+        # Collect first, erase later: the graph is not mutated while scanning.
+        dead_placeholders = []
+        for candidate in graph.nodes:
+            if not _is_constant_placeholder(exported_program, candidate):
+                continue
+            if len(candidate.users) == 0:
+                dead_placeholders.append(candidate)
+        if not dead_placeholders:
+            return PassResult(False)
+        removed_names = []
+        for dead in dead_placeholders:
+            # Remember the name before the node is erased.
+            removed_names.append(dead.name)
+            _remove_constant_placeholder(exported_program, dead)
+        # Rebuild the input specs in the order of the surviving placeholders.
+        signature = exported_program.graph_signature
+        spec_by_name = {}
+        for spec in signature.input_specs:
+            spec_by_name[spec.arg.name] = spec
+        surviving_specs = []
+        for remaining in graph.nodes:
+            if remaining.op == "placeholder":
+                surviving_specs.append(spec_by_name[remaining.name])
+        signature.input_specs = surviving_specs
+        graph.lint()
+        graph_module.recompile()
+        logger.debug(f"Unused constant placeholders are removed: {removed_names}")
+        # Run only once.
+        return PassResult(False)

{tico-0.2.0.dev260511 → tico-0.2.0.dev260512}/tico/quantization/config/builders.py RENAMED Viewed

@@ -13,9 +13,13 @@
 # limitations under the License.
 import copy
-from dataclasses import dataclass, field
 from typing import Any, Dict, Mapping, Optional, Tuple, Type
+from tico.quantization.config.llama_attention import (
+    DEFAULT_EXECUTION_PROFILE,
+    ExecutionProfile,
+    normalize_execution_profile,
+)
 from tico.quantization.config.ptq import PTQConfig
 from tico.quantization.config.utils import auto_qscheme_for
 from tico.quantization.wrapq.dtypes import DType
@@ -336,6 +340,7 @@ def build_llm_ptq_config(
     norm_weight_bits: Optional[int] = None,
     norm_weight_dtype: Optional[DType] = None,
     strict_wrap: bool = True,
+    profile: ExecutionProfile = DEFAULT_EXECUTION_PROFILE,
 ) -> PTQConfig:
     """
     Build a PTQConfig for an LLM using model-family-aware override generation.
@@ -363,9 +368,7 @@ def build_llm_ptq_config(
         explicit override.
     default_observer : Type[ObserverBase], default=MinMaxObserver
         Observer class to instantiate when no explicit observer is provided
-        via overrides.
-        This should be a subclass of `ObserverBase` (e.g., MinMaxObserver,
-        EMAObserver). The class itself (not an instance) must be passed.
+        through overrides.
     linear_weight_bits : Optional[int], default=None
         Convenience bit-width for decoder-layer linear projection weights.
         Used only when `linear_weight_dtype` is not provided.
@@ -391,6 +394,12 @@ def build_llm_ptq_config(
     strict_wrap : bool, default=True
         If True, preparing a model will raise when a required module cannot be
         wrapped.
+    profile : ExecutionProfile, default="npu_export"
+        Execution profile stored as `PTQConfig.model_args["profile"]`.
+        "reference_eval" selects a GPU-friendly, Hugging Face-like path.
+        "npu_export" preserves the existing NPU-export-oriented graph.
+        Advanced users may override or extend `qcfg.model_args` directly
+        before calling `prepare()`.
     Returns
     -------
@@ -402,6 +411,11 @@ def build_llm_ptq_config(
     NotImplementedError
         If the requested `model_type` is not supported.
     """
+    profile = normalize_execution_profile(
+        profile,
+        context="build_llm_ptq_config.profile",
+    )
     resolved_linear_weight_dtype = _resolve_weight_dtype(
         dtype=linear_weight_dtype,
         bits=linear_weight_bits,
@@ -438,6 +452,7 @@ def build_llm_ptq_config(
         default_qscheme=default_qscheme,
         default_observer=default_observer,
         overrides=overrides,
+        model_args={"profile": profile},
         strict_wrap=strict_wrap,
     )
@@ -448,7 +463,10 @@ def _build_qwen3_vl_norm_override(
     norm_weight_dtype: Optional[DType],
 ) -> Dict[str, Any]:
     """
-    Build an override dictionary for Qwen3-VL norm modules (RMSNorm and LayerNorm).
+    Build an override dictionary for Qwen3-VL norm modules.
+    The generated override covers both RMSNorm-style observers used by text
+    modules and LayerNorm-style observers used by vision modules.
     Parameters
     ----------

tico-0.2.0.dev260512/tico/quantization/config/llama_attention.py ADDED Viewed

@@ -0,0 +1,209 @@
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass, fields, replace
+from typing import Any, cast, Literal, Mapping, Optional
+from tico.quantization.config.ptq import PTQConfig
+# Name of an execution profile; each profile has one entry in `_PRESETS`.
+ExecutionProfile = Literal["reference_eval", "npu_export"]
+# Where the attention scale is applied: on logits at runtime ("none") or
+# folded into the weights of the named projection.
+ScaleFusion = Literal["none", "q_proj", "k_proj"]
+# Rotary embedding sign convention.
+RopeConvention = Literal["hf", "pre_negated_sin"]
+# Attention implementation layout.
+AttentionLayout = Literal["batched", "unrolled"]
+# Profile used when the caller does not select one.
+DEFAULT_EXECUTION_PROFILE: ExecutionProfile = "npu_export"
+# Every profile string accepted by `normalize_execution_profile`.
+SUPPORTED_EXECUTION_PROFILES: tuple[ExecutionProfile, ...] = (
+    "reference_eval",
+    "npu_export",
+)
+@dataclass(frozen=True)
+class LlamaAttentionOptions:
+    """
+    Execution options for quantized Llama attention wrappers.
+    These options describe graph-level implementation choices, not quantization
+    policy. They are intentionally read from `PTQConfig.model_args` instead of
+    `PTQConfig.overrides`.
+    Instances are immutable (`frozen=True`); derive a modified copy with
+    `dataclasses.replace` instead of assigning to a field.
+    Attributes
+    ----------
+    scale_fusion : ScaleFusion
+        Where to apply the attention scale `1 / sqrt(head_dim)`.
+        "none" applies it to logits at runtime, while "q_proj" and
+        "k_proj" fold it into the corresponding projection weights.
+    rope : RopeConvention
+        Rotary embedding sign convention. "hf" uses `rotate_half` as
+        `(-x2, x1)` with normal sine values. "pre_negated_sin" expects the
+        first half of sine values to be pre-negated and uses `(x2, x1)` in the
+        rotate-half operation.
+    layout : AttentionLayout
+        Attention implementation layout. "batched" is closer to the
+        Hugging Face implementation and is preferable for GPU evaluation.
+        "unrolled" preserves the NPU-export-friendly per-head loop.
+    """
+    # The field defaults below are the same values as the "npu_export" preset.
+    scale_fusion: ScaleFusion = "k_proj"
+    rope: RopeConvention = "pre_negated_sin"
+    layout: AttentionLayout = "unrolled"
+# Baseline attention options for each execution profile. Callers start from
+# one of these presets and may override individual fields on top of it.
+_PRESETS: dict[ExecutionProfile, LlamaAttentionOptions] = {
+    # Runtime scaling, Hugging Face rotary convention, batched attention.
+    "reference_eval": LlamaAttentionOptions(
+        scale_fusion="none",
+        rope="hf",
+        layout="batched",
+    ),
+    # Scale folded into k_proj, pre-negated sine, per-head unrolled attention.
+    "npu_export": LlamaAttentionOptions(
+        scale_fusion="k_proj",
+        rope="pre_negated_sin",
+        layout="unrolled",
+    ),
+}
+def normalize_execution_profile(
+    profile: Any,
+    *,
+    context: str = "profile",
+) -> ExecutionProfile:
+    """
+    Check a user-supplied execution profile and hand it back unchanged.
+    Parameters
+    ----------
+    profile : Any
+        Value supplied by the user; only supported profile strings pass.
+    context : str
+        Description of where the value came from, embedded in error messages.
+    Returns
+    -------
+    ExecutionProfile
+        The same string, now known to be a supported profile.
+    Raises
+    ------
+    TypeError
+        If `profile` is not a string.
+    ValueError
+        If `profile` is a string that is not a supported profile.
+    """
+    if isinstance(profile, str):
+        if profile in SUPPORTED_EXECUTION_PROFILES:
+            return cast(ExecutionProfile, profile)
+        raise ValueError(
+            f"Unsupported execution profile at {context}: {profile!r}. "
+            f"Supported profiles: {list(SUPPORTED_EXECUTION_PROFILES)}."
+        )
+    # Anything that is not a string is rejected before the membership test.
+    raise TypeError(f"{context} must be a string, got {type(profile).__name__}.")
+def get_llama_attention_options(
+    qcfg: Optional[PTQConfig],
+) -> LlamaAttentionOptions:
+    """
+    Build the Llama attention options described by a PTQConfig.
+    `model_args["profile"]` at the root picks the default execution profile.
+    `model_args["attention"]` may refine it, either with its own "profile"
+    entry, with individual option fields, or both. For example::
+        PTQConfig(..., model_args={"profile": "reference_eval"})
+    selects the "reference_eval" preset as is, while::
+        PTQConfig(
+            ...,
+            model_args={
+                "profile": "reference_eval",
+                "attention": {
+                    "layout": "unrolled",
+                },
+            },
+        )
+    starts from the same preset and replaces only `layout`.
+    `model_args["attention"]` may also be a bare profile string such as
+    "npu_export", or None, which is treated like an empty mapping. Without
+    any setting the "npu_export" profile is used.
+    Parameters
+    ----------
+    qcfg : Optional[PTQConfig]
+        PTQ configuration associated with the wrapper. None yields the preset
+        of the default profile.
+    Returns
+    -------
+    LlamaAttentionOptions
+        Validated execution options.
+    Raises
+    ------
+    TypeError
+        If a profile is not a string, or `model_args["attention"]` is neither
+        a mapping, a string, nor None.
+    ValueError
+        If a profile or option value is unsupported, or an unknown option key
+        is present.
+    """
+    if qcfg is None:
+        return _PRESETS[DEFAULT_EXECUTION_PROFILE]
+    configured_profile = qcfg.get_model_arg("profile", DEFAULT_EXECUTION_PROFILE)
+    root_profile = normalize_execution_profile(
+        configured_profile,
+        context="PTQConfig.model_args['profile']",
+    )
+    # Normalize the three accepted shapes of the attention entry to a mapping.
+    attention_arg = qcfg.get_model_arg("attention", {})
+    if attention_arg is None:
+        attention_arg = {}
+    elif isinstance(attention_arg, str):
+        attention_arg = {"profile": attention_arg}
+    elif not isinstance(attention_arg, Mapping):
+        raise TypeError(
+            "PTQConfig.model_args['attention'] must be a mapping, a string, or None."
+        )
+    # Work on a copy so the "profile" key can be split off from the options.
+    overrides = dict(attention_arg)
+    requested_profile = overrides.pop("profile", root_profile)
+    profile = normalize_execution_profile(
+        requested_profile,
+        context="PTQConfig.model_args['attention']['profile']",
+    )
+    known_fields = {item.name for item in fields(LlamaAttentionOptions)}
+    unknown_keys = sorted(set(overrides).difference(known_fields))
+    if unknown_keys:
+        raise ValueError(f"Unknown Llama attention option(s): {unknown_keys}.")
+    options = replace(_PRESETS[profile], **overrides)
+    _validate_llama_attention_options(options)
+    return options
+def is_npu_export_attention_options(options: LlamaAttentionOptions) -> bool:
+    """
+    Tell whether `options` describe the NPU-export-friendly attention graph.
+    That is the case when the scale is fused into "k_proj", the rotary
+    convention is "pre_negated_sin", and the layout is "unrolled".
+    """
+    selected = (options.scale_fusion, options.rope, options.layout)
+    return selected == ("k_proj", "pre_negated_sin", "unrolled")
+def _validate_llama_attention_options(options: LlamaAttentionOptions) -> None:
+    """
+    Reject a resolved LlamaAttentionOptions whose field values are unsupported.
+    Fields are checked in the order scale_fusion, rope, layout; the first
+    unsupported value raises ValueError.
+    """
+    scale_fusion_choices = ("none", "q_proj", "k_proj")
+    rope_choices = ("hf", "pre_negated_sin")
+    layout_choices = ("batched", "unrolled")
+    if not (options.scale_fusion in scale_fusion_choices):
+        raise ValueError(f"Unsupported scale_fusion: {options.scale_fusion!r}.")
+    if not (options.rope in rope_choices):
+        raise ValueError(f"Unsupported rope convention: {options.rope!r}.")
+    if not (options.layout in layout_choices):
+        raise ValueError(f"Unsupported attention layout: {options.layout!r}.")

tico-0.2.0.dev260512/tico/quantization/passes/quantize_bias.py ADDED Viewed

@@ -0,0 +1,145 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, TYPE_CHECKING
+if TYPE_CHECKING:
+    import torch.fx
+import torch
+from torch.export import ExportedProgram
+from tico.serialize.quant_param import QPARAM_KEY, QuantParam, to_qparam_dtype
+from tico.utils import logging
+from tico.utils.graph import add_placeholder, get_torch_param_value, is_torch_param
+from tico.utils.passes import PassBase, PassResult
+from tico.utils.trace_decorators import trace_graph_diff_on_pass
+from tico.utils.validate_args_kwargs import Conv2DArgs, LinearArgs
+def _get_input_weight_bias_for_bias_quantization(
+    node: "torch.fx.Node",
+) -> Optional[Tuple["torch.fx.Node", "torch.fx.Node", "torch.fx.Node"]]:
+    """
+    Extract the (input, weight, bias) nodes of an operator with a quantizable bias.
+    Handles `aten.linear.default` and the `circle_custom.conv2d` operators.
+    Returns None for any other target and for supported operators that carry
+    no bias. The tuple matches the bias quantization rule in which the bias
+    scale is derived from the input scale and the per-output-channel weight
+    scale.
+    """
+    target = node.target
+    if target == torch.ops.aten.linear.default:
+        parsed = LinearArgs(*node.args, **node.kwargs)
+    elif target in (
+        torch.ops.circle_custom.conv2d,
+        torch.ops.circle_custom.conv2d.padding,
+    ):
+        parsed = Conv2DArgs(*node.args, **node.kwargs)
+    else:
+        return None
+    # Both argument wrappers expose input/weight/bias under the same names.
+    if parsed.bias is None:
+        return None
+    return parsed.input, parsed.weight, parsed.bias
+@trace_graph_diff_on_pass
+class QuantizeBias(PassBase):
+    """
+    Quantize bias.
+    This pass identifies fp32 biases, quantizes them using scales of input and weights.
+    This pass assumes that if bias is fp32, input and weights must have been quantized.
+    Only linear/conv2d nodes returned by
+    `_get_input_weight_bias_for_bias_quantization` are considered. A node is
+    skipped when its bias is not a parameter, is not float32, when the input or
+    weight node has no quantization parameters, or when the input dtype is
+    neither "int16" nor "uint8".
+    """
+    def __init__(self):
+        super().__init__()
+    def call(self, exported_program: ExportedProgram) -> PassResult:
+        """
+        Replace eligible fp32 bias arguments with quantized bias placeholders.
+        Always returns `PassResult(False)` so the pass runs only once.
+        """
+        logger = logging.getLogger(__name__)
+        graph_module = exported_program.graph_module
+        graph: torch.fx.Graph = graph_module.graph
+        for node in graph.nodes:
+            if node.op != "call_function":
+                continue
+            op_args = _get_input_weight_bias_for_bias_quantization(node)
+            if op_args is None:
+                continue
+            inp, weights, bias = op_args
+            # Only biases that are Parameters are supported.
+            # TODO Is it possible that bias is not Parameter?
+            if not is_torch_param(bias, exported_program):
+                continue
+            bias_val: torch.Tensor = get_torch_param_value(bias, exported_program)
+            # Anything that is not fp32 is left as is.
+            if bias_val.dtype != torch.float32:
+                continue
+            # Both scales are needed below, so unquantized inputs/weights are skipped.
+            if QPARAM_KEY not in inp.meta:
+                continue
+            if QPARAM_KEY not in weights.meta:
+                continue
+            # The bias integer type follows the input activation dtype:
+            # int16 input -> int64 bias, uint8 input -> int32 bias.
+            quant_dtype = None
+            if inp.meta[QPARAM_KEY].dtype == "int16":
+                quant_dtype = torch.int64
+            elif inp.meta[QPARAM_KEY].dtype == "uint8":
+                quant_dtype = torch.int32
+            else:
+                continue
+            assert quant_dtype is not None
+            type_info = torch.iinfo(quant_dtype)
+            i_scale = inp.meta[QPARAM_KEY].scale
+            w_scale = weights.meta[QPARAM_KEY].scale
+            assert i_scale is not None
+            assert w_scale is not None
+            # Expects a single input scale and one weight scale per bias element
+            # along dim 0.
+            assert len(i_scale) == 1
+            assert len(w_scale) == bias_val.shape[0]
+            # bias_scale[i] = input_scale * weight_scale[i]
+            bias_scale = torch.tensor(i_scale) * torch.tensor(w_scale)
+            q_bias = torch.round(bias_val / bias_scale)
+            # Saturate to the range of the target integer type before casting.
+            q_bias = torch.clamp(q_bias, min=type_info.min, max=type_info.max)
+            q_bias = q_bias.to(quant_dtype)
+            # NOTE(review): presumably `bias.name` serves as the name (prefix) of
+            # the new placeholder - confirm against `add_placeholder`.
+            q_bias_node = add_placeholder(exported_program, q_bias, bias.name)
+            qparam = QuantParam()
+            qparam.scale = bias_scale.tolist()
+            assert qparam.scale is not None
+            # One zero point of 0 per scale entry.
+            qparam.zero_point = [0] * len(qparam.scale)
+            qparam.dtype = to_qparam_dtype(quant_dtype)
+            qparam.quantized_dimension = 0
+            q_bias_node.meta[QPARAM_KEY] = qparam
+            # Rewire positional argument 2 of the operator to the quantized bias.
+            # NOTE(review): assumes bias is passed positionally at index 2 - confirm
+            # for nodes that pass bias as a keyword argument.
+            node.update_arg(2, q_bias_node)
+            logger.debug(f"Bias ({bias.name}) is quantized to {q_bias_node.name}.")
+        graph.eliminate_dead_code()
+        graph.lint()
+        graph_module.recompile()
+        # Run only once.
+        return PassResult(False)

tico 0.2.0.dev260511__tar.gz → 0.2.0.dev260512__tar.gz

tico 0.2.0.dev260511__tar.gz → 0.2.0.dev260512__tar.gz