tico 0.1.0.dev250911__py3-none-any.whl → 0.1.0.dev250914__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tico/__init__.py +1 -1
- tico/experimental/quantization/ptq/examples/compare_ppl.py +199 -81
- tico/experimental/quantization/ptq/wrappers/fairseq/quant_encoder.py +333 -0
- tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py +53 -14
- tico/experimental/quantization/ptq/wrappers/llama/quant_decoder_layer.py +14 -2
- tico/experimental/quantization/ptq/wrappers/registry.py +1 -0
- tico/passes/decompose_fake_quantize_tensor_qparams.py +4 -3
- tico/passes/remove_redundant_expand.py +3 -1
- {tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/METADATA +1 -1
- {tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/RECORD +14 -13
- {tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/LICENSE +0 -0
- {tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/WHEEL +0 -0
- {tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/entry_points.txt +0 -0
- {tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/top_level.txt +0 -0
tico/__init__.py
CHANGED

tico/experimental/quantization/ptq/examples/compare_ppl.py
CHANGED

@@ -20,6 +20,10 @@
 # • Full post-training UINT-8 flow (wrap → calibrate → eval).
 # =============================================================================
 
+import argparse
+import sys
+from typing import Optional
+
 import torch
 import tqdm
 from datasets import load_dataset
@@ -29,14 +33,6 @@ from tico.experimental.quantization.ptq.quant_config import QuantConfig
 from tico.experimental.quantization.ptq.utils.metrics import perplexity
 from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
 
-# -------------------------------------------------------------------------
-# 0. Global configuration
-# -------------------------------------------------------------------------
-MODEL_NAME = "meta-llama/Meta-Llama-3-1B"
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-STRIDE = 512  # sliding-window stride for perplexity
-RUN_FP = True  # set False → run UINT-8 path
-
 # Token-budget presets for activation calibration
 TOKENS: dict[str, int] = {
     # Smoke test (<1 min turnaround on CPU/GPU)
@@ -46,76 +42,198 @@ TOKENS: dict[str, int] = {
     # Production / 4-bit observer smoothing
     "production": 200_000,
 }
-… (73 lines of the old hard-coded script removed; their content is not shown in this view)
+
+DTYPE_MAP = {
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+    "float16": torch.float16,
+}
+
+# Hardcoded dataset settings
+DATASET_NAME = "wikitext"
+DATASET_CONFIG = "wikitext-2-raw-v1"
+TRAIN_SPLIT = "train"
+TEST_SPLIT = "test"
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Quick PTQ example (FP or UINT8)")
+    parser.add_argument(
+        "--mode",
+        choices=["fp", "uint8"],
+        default="fp",
+        help="Choose FP baseline only or full UINT8 PTQ path.",
+    )
+    parser.add_argument(
+        "--model", type=str, required=True, help="HF repo name or local path."
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device to run on (cuda|cpu).",
+    )
+    parser.add_argument(
+        "--dtype",
+        choices=list(DTYPE_MAP.keys()),
+        default="float32",
+        help="Model dtype for load (float32|bfloat16|float16).",
+    )
+    parser.add_argument(
+        "--stride", type=int, default=512, help="Sliding-window stride for perplexity."
+    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed.")
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Enable only if you trust the model repo code.",
+    )
+    parser.add_argument(
+        "--hf-token",
+        type=str,
+        default=None,
+        help="Optional HF token for gated/private models.",
+    )
+    parser.add_argument(
+        "--use-cache",
+        dest="use_cache",
+        action="store_true",
+        default=False,
+        help="Use model KV cache if enabled (off by default).",
+    )
+    parser.add_argument(
+        "--no-tqdm", action="store_true", help="Disable tqdm progress bars."
+    )
+    # 2) calib-preset default = debug
+    parser.add_argument(
+        "--calib-preset",
+        choices=list(TOKENS.keys()),
+        default="debug",
+        help="Calibration token budget preset.",
+    )
+
+    args = parser.parse_args()
+
+    # Basic setup
+    torch.manual_seed(args.seed)
+    device = torch.device(args.device)
+    dtype = DTYPE_MAP[args.dtype]
+
+    print("=== Config ===")
+    print(f"Mode : {args.mode}")
+    print(f"Model : {args.model}")
+    print(f"Device : {device.type}")
+    print(f"DType : {args.dtype}")
+    print(f"Stride : {args.stride}")
+    print(f"Use HF cache? : {args.use_cache}")
+    print(f"Calib preset : {args.calib_preset}")
+    print()
+
+    # -------------------------------------------------------------------------
+    # 1. Load model and tokenizer
+    # -------------------------------------------------------------------------
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model,
+        trust_remote_code=args.trust_remote_code,
+        token=args.hf_token,
+    )
+
+    model = (
+        AutoModelForCausalLM.from_pretrained(
+            args.model,
+            torch_dtype=dtype,
+            trust_remote_code=args.trust_remote_code,
+            token=args.hf_token,
+        )
+        .to(device)
+        .eval()
+    )
+
+    model.config.use_cache = args.use_cache
+
+    if args.mode == "fp":
+        fp_model = model
+    else:
+        # INT8 PTQ path
+        uint8_model = model
+
+        CALIB_TOKENS = TOKENS[args.calib_preset]
+        print(f"Calibrating with {CALIB_TOKENS:,} tokens.\n")
+
+        # ---------------------------------------------------------------------
+        # 2. Wrap every Transformer layer with PTQWrapper
+        # ---------------------------------------------------------------------
+        qcfg = QuantConfig()  # all-uint8 defaults
+
+        wrapped_layers = torch.nn.ModuleList()
+        for idx, layer in enumerate(uint8_model.model.layers):
+            layer_cfg = qcfg.child(f"layer{idx}")
+            wrapped_layers.append(PTQWrapper(layer, qcfg=layer_cfg))
+        uint8_model.model.layers = wrapped_layers
+
+        # ---------------------------------------------------------------------
+        # 3. Single-pass activation calibration
+        # ---------------------------------------------------------------------
+        print("Calibrating UINT-8 observers …")
+        calib_txt = " ".join(
+            load_dataset(DATASET_NAME, DATASET_CONFIG, split=TRAIN_SPLIT)["text"]
+        )[:CALIB_TOKENS]
+        ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(device)
+
+        # (a) switch every QuantModuleBase to CALIB mode
+        for l in uint8_model.model.layers:
+            l.enable_calibration()
+
+        # (b) run inference to collect ranges
+        iterator = range(0, ids.size(1) - 1, args.stride)
+        if not args.no_tqdm:
+            iterator = tqdm.tqdm(iterator, desc="Calibration")
+        with torch.no_grad():
+            for i in iterator:
+                uint8_model(ids[:, i : i + args.stride])
+
+        # (c) freeze (scale, zero-point)
+        for l in uint8_model.model.layers:
+            l.freeze_qparams()
+
+    # -------------------------------------------------------------------------
+    # 4. Evaluate perplexity
+    # -------------------------------------------------------------------------
+    print("\nCalculating perplexities …")
+    test_ds = load_dataset(DATASET_NAME, DATASET_CONFIG, split=TEST_SPLIT)
+    enc = tokenizer("\n\n".join(test_ds["text"]), return_tensors="pt")
+
+    if args.mode == "fp":
+        ppl_fp = perplexity(
+            fp_model,
+            enc,
+            args.device,
+            stride=args.stride,
+            show_progress=not args.no_tqdm,
+        )
+    else:
+        ppl_int8 = perplexity(
+            uint8_model,
+            enc,
+            args.device,
+            stride=args.stride,
+            show_progress=not args.no_tqdm,
+        )
+
+    # -------------------------------------------------------------------------
+    # 5. Report
+    # -------------------------------------------------------------------------
+    print("\n┌── Wikitext-2 test perplexity ─────────────")
+    if args.mode == "fp":
+        print(f"│ FP : {ppl_fp:8.2f}")
+    else:
+        print(f"│ UINT-8 : {ppl_int8:8.2f}")
+    print("└───────────────────────────────────────────")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"\n[Error] {e}", file=sys.stderr)
+        sys.exit(1)
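The rewritten example replaces the old hard-coded globals (MODEL_NAME, DEVICE, STRIDE, RUN_FP) with CLI flags. A minimal usage sketch, assuming the wheel plus its example dependencies (transformers, datasets, tqdm) are installed; the model id below is only illustrative (it is the one the removed MODEL_NAME constant pointed at and may require an HF token):

```python
# Sketch only: drive the new argparse entry point in-process, equivalent to
#   python -m tico.experimental.quantization.ptq.examples.compare_ppl \
#       --model meta-llama/Meta-Llama-3-1B --mode uint8 --calib-preset debug
import sys

from tico.experimental.quantization.ptq.examples import compare_ppl

sys.argv = [
    "compare_ppl",
    "--model", "meta-llama/Meta-Llama-3-1B",   # illustrative, gated repo
    "--mode", "uint8",
    "--calib-preset", "debug",
    "--stride", "512",
    "--no-tqdm",
]
compare_ppl.main()
```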
tico/experimental/quantization/ptq/wrappers/fairseq/quant_encoder.py
ADDED

@@ -0,0 +1,333 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# -----------------------------------------------------------------------------
+# This file includes modifications based on fairseq
+# (https://github.com/facebookresearch/fairseq), originally licensed under
+# the MIT License. See the LICENSE file in the fairseq repository for details.
+# -----------------------------------------------------------------------------
+
+import math
+from typing import Dict, List, Literal, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from tico.experimental.quantization.ptq.quant_config import QuantConfig
+from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
+from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
+    QuantModuleBase,
+)
+from tico.experimental.quantization.ptq.wrappers.registry import try_register
+
+
+@try_register("fairseq.models.transformer.TransformerEncoderBase")
+class QuantFairseqEncoder(QuantModuleBase):
+    """
+    Quant-aware drop-in replacement for Fairseq TransformerEncoderBase.
+
+    Key design choices:
+      - Keep embeddings and LayerNorms in FP.
+      - Remove training-time logic (dropout, activation-dropout, quant_noise).
+      - Attention masks are handled statically inside the layer wrapper; this
+        encoder only does the original padding zero-out before the stack.
+
+    I/O contracts:
+      - Forward signature and returned dictionary are identical to the original
+        when `use_external_inputs=False`.
+      - When `use_external_inputs=True`, forward returns a single Tensor (T,B,C)
+        and completely skips embedding/positional/LN/mask-creation paths.
+      - Tensor shapes follow Fairseq convention.
+    """
+
+    def __init__(
+        self,
+        fp_encoder: nn.Module,
+        *,
+        qcfg: Optional[QuantConfig] = None,
+        fp_name: Optional[str] = None,
+        use_external_inputs: bool = False,  # export-mode flag
+        return_type: Literal["tensor", "dict"] = "dict",
+    ):
+        super().__init__(qcfg, fp_name=fp_name)
+        self.use_external_inputs = use_external_inputs
+        self.return_type: Literal["tensor", "dict"] = return_type
+
+        # --- carry basic config / metadata (read-only copies) ---------------
+        assert hasattr(fp_encoder, "cfg")
+        self.cfg = fp_encoder.cfg
+        self.return_fc: bool = bool(getattr(fp_encoder, "return_fc", False))
+
+        # Embedding stack ----------------------------------------------------
+        assert hasattr(fp_encoder, "embed_tokens") and isinstance(
+            fp_encoder.embed_tokens, nn.Module
+        )
+        self.embed_tokens = fp_encoder.embed_tokens  # keep FP embeddings
+
+        assert hasattr(fp_encoder, "padding_idx")
+        self.padding_idx: int = int(fp_encoder.padding_idx)  # type: ignore[arg-type]
+
+        # scale = sqrt(embed_dim) unless disabled
+        embed_dim = int(self.embed_tokens.embedding_dim)  # type: ignore[arg-type]
+        no_scale = bool(getattr(self.cfg, "no_scale_embedding", False))
+        self.embed_scale: float = 1.0 if no_scale else math.sqrt(embed_dim)
+
+        # Positional embeddings (keep as-is; no FQ)
+        self.embed_positions = getattr(fp_encoder, "embed_positions", None)
+        # Optional embedding LayerNorm
+        self.layernorm_embedding = getattr(fp_encoder, "layernorm_embedding", None)
+
+        # Final encoder LayerNorm (pre-norm stacks may set this to None)
+        self.layer_norm = getattr(fp_encoder, "layer_norm", None)
+
+        # Max positions (reuse for API parity)
+        self.max_source_positions: int = int(fp_encoder.max_source_positions)  # type: ignore[arg-type]
+
+        # --- wrap encoder layers with PTQWrapper ----------------------------
+        assert hasattr(fp_encoder, "layers")
+        fp_layers = list(fp_encoder.layers)  # type: ignore[arg-type]
+        self.layers = nn.ModuleList()
+
+        # Prepare child QuantConfig namespaces: layers/<idx>
+        layers_qcfg = qcfg.child("layers") if qcfg else None
+        for i, layer in enumerate(fp_layers):
+            child_cfg = layers_qcfg.child(str(i)) if layers_qcfg else None
+            self.layers.append(
+                PTQWrapper(layer, qcfg=child_cfg, fp_name=f"{fp_name}.layers.{i}")
+            )
+
+        # Version buffer (keep for state_dict parity)
+        version = getattr(fp_encoder, "version", None)
+        if isinstance(version, torch.Tensor):
+            self.register_buffer("version", version.clone(), persistent=False)
+        else:
+            self.register_buffer("version", torch.tensor([3.0]), persistent=False)
+
+    # ----------------------------------------------------------------------
+    def forward_embedding(
+        self, src_tokens: Tensor, token_embedding: Optional[Tensor] = None
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Embed tokens and add positional embeddings. Dropout/quant_noise are removed.
+        Returns:
+            x (B, T, C), embed (B, T, C)  # embed is the token-only embedding
+        """
+        if token_embedding is None:
+            token_embedding = self.embed_tokens(src_tokens)
+        embed = token_embedding  # token-only
+
+        x = self.embed_scale * token_embedding
+        if self.embed_positions is not None:
+            x = x + self.embed_positions(src_tokens)
+        if self.layernorm_embedding is not None:
+            x = self.layernorm_embedding(x)
+        # No dropout, no quant_noise here (inference-only)
+        return x, embed
+
+    # ----------------------------------------------------------------------
+    def forward(
+        self,
+        src_tokens: Tensor,
+        src_lengths: Optional[Tensor] = None,
+        return_all_hiddens: bool = False,
+        token_embeddings: Optional[Tensor] = None,
+        *,
+        # External-inputs branch (used for export)
+        encoder_padding_mask: Optional[Tensor] = None,  # B x T (bool)
+    ) -> Tensor | Dict[str, List[Optional[Tensor]]]:
+        """
+        If `self.use_external_inputs` is True:
+          - Use only x_external and encoder_padding_mask.
+          - Return a single Tensor (T, B, C) for export friendliness.
+
+        Otherwise (False):
+          - Behave like the original Fairseq encoder forward and return dict-of-lists.
+        """
+        if self.use_external_inputs:
+            # ----- External-input mode: completely skip embedding/positional/LN/mask creation -----
+            x_external = src_tokens  # T x B x C (already embedded + transposed)
+
+            encoder_states: List[Tensor] = []
+            if return_all_hiddens:
+                encoder_states.append(x_external)
+
+            for layer in self.layers:
+                out = layer(x_external, encoder_padding_mask=encoder_padding_mask)
+                x_external = (
+                    out[0] if (isinstance(out, tuple) and len(out) == 2) else out
+                )
+                if return_all_hiddens:
+                    encoder_states.append(x_external)
+
+            if self.layer_norm is not None:
+                x_external = self.layer_norm(x_external)
+
+            if self.return_type == "dict":
+                return {
+                    "encoder_out": [x_external],
+                    "encoder_padding_mask": [encoder_padding_mask],
+                    "encoder_states": encoder_states,  # type: ignore[dict-item]
+                }
+            else:
+                # For export, returning a single Tensor is simpler and more portable.
+                return x_external
+
+        # ----- Original path (training/eval compatibility) ------------------
+
+        # Compute padding mask [B, T] (bool). We keep the original "has_pads" logic.
+        encoder_padding_mask = src_tokens.eq(self.padding_idx)
+        has_pads: Tensor = (
+            torch.tensor(src_tokens.device.type == "xla") or encoder_padding_mask.any()
+        )
+        if torch.jit.is_scripting():
+            has_pads = torch.tensor(1) if has_pads else torch.tensor(0)
+
+        # Embedding path (B,T,C). No dropout/quant_noise.
+        x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)
+
+        # Zero out padded timesteps prior to the stack (same as original)
+        x = x * (
+            1 - encoder_padding_mask.unsqueeze(-1).type_as(x) * has_pads.type_as(x)
+        )
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        encoder_states: List[Tensor] = []  # type: ignore[no-redef]
+        fc_results: List[Optional[Tensor]] = []
+
+        if return_all_hiddens:
+            encoder_states.append(x)
+
+        # Encoder layers (each item is PTQ-wrapped and uses static additive masks internally)
+        for layer in self.layers:
+            out = layer(
+                x, encoder_padding_mask=encoder_padding_mask if has_pads else None
+            )
+            if isinstance(out, tuple) and len(out) == 2:
+                x, fc_res = out
+            else:
+                x = out
+                fc_res = None
+
+            if return_all_hiddens and not torch.jit.is_scripting():
+                encoder_states.append(x)
+                fc_results.append(fc_res)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # src_lengths (B, 1) int32, identical to original
+        src_lengths_out = (
+            src_tokens.ne(self.padding_idx)
+            .sum(dim=1, dtype=torch.int32)
+            .reshape(-1, 1)
+            .contiguous()
+        )
+
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": [encoder_padding_mask],  # B x T
+            "encoder_embedding": [encoder_embedding],  # B x T x C
+            "encoder_states": encoder_states,  # type: ignore[dict-item]  # List[T x B x C]
+            "fc_results": fc_results,  # type: ignore[dict-item]  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [src_lengths_out],
+        }
+
+    def forward_torchscript(self, net_input: Dict[str, Tensor]):
+        """A TorchScript-compatible version of forward.
+
+        Encoders which use additional arguments may want to override
+        this method for TorchScript compatibility.
+        """
+        if "encoder_padding_mask" in net_input:
+            return self.forward(
+                src_tokens=net_input["src_tokens"],
+                src_lengths=net_input["src_lengths"],
+                encoder_padding_mask=net_input["encoder_padding_mask"],
+            )
+        else:
+            return self.forward(
+                src_tokens=net_input["src_tokens"],
+                src_lengths=net_input["src_lengths"],
+            )
+
+    # ----------------------------------------------------------------------
+    @torch.jit.export
+    def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
+        """
+        Match original API: reorder the batched dimension (B) according to new_order.
+        """
+        reordered = dict()  # type: ignore[var-annotated]
+        if len(encoder_out["encoder_out"]) == 0:
+            new_encoder_out = []
+        else:
+            new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)]
+        reordered["encoder_out"] = new_encoder_out
+        keys = [
+            "encoder_padding_mask",
+            "encoder_embedding",
+            "src_tokens",
+            "src_lengths",
+        ]
+        for k in keys:
+            if k not in encoder_out:
+                continue
+            if len(encoder_out[k]) == 0:
+                reordered[k] = []
+            else:
+                reordered[k] = [encoder_out[k][0].index_select(0, new_order)]
+
+        if "encoder_states" in encoder_out:
+            encoder_states = encoder_out["encoder_states"]
+            if len(encoder_states) > 0:
+                for idx, state in enumerate(encoder_states):
+                    encoder_states[idx] = state.index_select(1, new_order)
+            reordered["encoder_states"] = encoder_states
+
+        return reordered
+
+    @torch.jit.export
+    def _reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
+        """Dummy re-order for beamable enc-dec attention (API parity)."""
+        return encoder_out
+
+    def max_positions(self) -> int:
+        """Maximum input length supported by the encoder (same policy as the original)."""
+        if self.embed_positions is None:
+            return self.max_source_positions
+        return min(self.max_source_positions, self.embed_positions.max_positions)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """
+        Forward-compat mapping for older checkpoints (mirror original behavior for LNs).
+        The actual remapping of per-layer norms is delegated to the wrapped layers.
+        """
+        for i, layer in enumerate(self.layers):
+            if hasattr(layer, "upgrade_state_dict_named"):
+                layer.upgrade_state_dict_named(state_dict, f"{name}.layers.{i}")
+
+        version_key = f"{name}.version"
+        v = state_dict.get(version_key, torch.Tensor([1]))
+        if float(v[0].item()) < 2:
+            self.layer_norm = None
+            state_dict[version_key] = torch.Tensor([1])
+        return state_dict
+
+    def _all_observers(self):
+        for m in self.layers:
+            if isinstance(m, QuantModuleBase):
+                yield from m._all_observers()
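The new wrapper copies the encoder's constructor-time attributes and wraps each encoder layer in a PTQWrapper. A minimal wrap-and-calibrate sketch, assuming fairseq is installed, that "model.pt" is a placeholder checkpoint path, and that enable_calibration()/freeze_qparams() are the same QuantModuleBase helpers that compare_ppl.py calls on PTQWrapper instances:

```python
# Sketch under the assumptions stated above; not a documented flow of this wheel.
import torch
from fairseq import checkpoint_utils

from tico.experimental.quantization.ptq.quant_config import QuantConfig
from tico.experimental.quantization.ptq.wrappers.fairseq.quant_encoder import (
    QuantFairseqEncoder,
)

models, _cfg, _task = checkpoint_utils.load_model_ensemble_and_task(["model.pt"])
model = models[0].eval()

qcfg = QuantConfig()  # all-uint8 defaults, as in compare_ppl.py
model.encoder = QuantFairseqEncoder(
    model.encoder,
    qcfg=qcfg.child("encoder"),
    fp_name="encoder",
)

# Calibrate observers on illustrative token ids, then freeze (scale, zero-point).
src_tokens = torch.randint(10, 100, (2, 16))
src_lengths = torch.full((2,), 16)
model.encoder.enable_calibration()
with torch.no_grad():
    model.encoder(src_tokens, src_lengths=src_lengths)
model.encoder.freeze_qparams()
```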
tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py
CHANGED

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -131,28 +131,38 @@ class QuantLlamaAttention(QuantModuleBase):
         x2n = self._fq(-x2, o_neg)
         return self._fq(torch.cat((x2n, x1), -1), o_cat)
 
+    @staticmethod
+    def _concat_kv(
+        past: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        k_new: torch.Tensor,
+        v_new: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Concat along sequence dim (dim=2): (B, n_kv, S, H)."""
+        if past is None:
+            return k_new, v_new
+        past_k, past_v = past
+        k = torch.cat([past_k, k_new], dim=2)
+        v = torch.cat([past_v, v_new], dim=2)
+        return k, v
+
     def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor] = None,
-        past_key_value=None,  #
+        past_key_value=None,  # tuple(k, v) or HF Cache-like object
+        use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         **kwargs,
     ):
-        if past_key_value is not None:
-            raise NotImplementedError(
-                "QuantLlamaAttention does not support KV cache yet."
-            )
-
         hidden = self._fq(hidden_states, self.obs_hidden)
         B, S, _ = hidden.shape
         H = self.hdim
 
         # projections
-        q = self.q_proj(hidden).view(B, S, -1, H).transpose(1, 2)
-        k = self.k_proj(hidden).view(B, S, -1, H).transpose(1, 2)
-        v = self.v_proj(hidden).view(B, S, -1, H).transpose(1, 2)
+        q = self.q_proj(hidden).view(B, S, -1, H).transpose(1, 2)  # (B, n_h, S, H)
+        k = self.k_proj(hidden).view(B, S, -1, H).transpose(1, 2)  # (B, n_kv, S, H)
+        v = self.v_proj(hidden).view(B, S, -1, H).transpose(1, 2)  # (B, n_kv, S, H)
 
         # rope tables
         cos, sin = position_embeddings
@@ -176,14 +186,37 @@ class QuantLlamaAttention(QuantModuleBase):
         k_sin = self._fq(k_half * sin_u, self.obs_k_sin)
         k_rot = self._fq(k_cos + k_sin, self.obs_k_rot)
 
+        # --- build/update KV for attention & present_key_value -------------
+        present_key_value: Tuple[torch.Tensor, torch.Tensor]
+
+        # HF Cache path (if available)
+        if use_cache and hasattr(past_key_value, "update"):
+            # Many HF Cache impls use update(k, v) and return (k_total, v_total)
+            try:
+                k_total, v_total = past_key_value.update(k_rot, v)
+                present_key_value = (k_total, v_total)
+                k_for_attn, v_for_attn = k_total, v_total
+            except Exception:
+                # Fallback to tuple concat if Cache signature mismatches
+                k_for_attn, v_for_attn = self._concat_kv(
+                    getattr(past_key_value, "kv", None), k_rot, v
+                )
+                present_key_value = (k_for_attn, v_for_attn)
+        else:
+            # Tuple or None path
+            pkv_tuple = past_key_value if isinstance(past_key_value, tuple) else None
+            k_for_attn, v_for_attn = self._concat_kv(pkv_tuple, k_rot, v)
+            present_key_value = (k_for_attn, v_for_attn)
+
         # logits
-        k_rep =
+        k_rep = k_for_attn.repeat_interleave(self.kv_rep, dim=1)  # (B, n_h, K, H)
         logits_raw = self._fq(q_rot @ k_rep.transpose(-2, -1), self.obs_logits_raw)
         scale = self._fq(self.scale_t, self.obs_scale)
         logits = self._fq(logits_raw * scale, self.obs_logits)
 
         if attention_mask is None or attention_mask.dtype == torch.bool:
-            _, _, q_len,
+            _, _, q_len, _ = logits.shape
+            k_len = k_for_attn.size(2)
             assert isinstance(self.causal_mask_template, torch.Tensor)
             attention_mask = self.causal_mask_template[..., :q_len, :k_len].to(
                 hidden_states.device
@@ -196,7 +229,7 @@ class QuantLlamaAttention(QuantModuleBase):
         attn_weights = self._fq(attn_weights, self.obs_softmax)
 
         # attn out
-        v_rep =
+        v_rep = v_for_attn.repeat_interleave(self.kv_rep, dim=1)  # (B, n_h, K, H)
         attn_out = (
             self._fq(attn_weights @ v_rep, self.obs_attn_out)
             .transpose(1, 2)
@@ -204,7 +237,13 @@ class QuantLlamaAttention(QuantModuleBase):
         )
 
         # final projection
-
+        out = self.o_proj(attn_out)
+
+        # return with/without cache
+        if use_cache:
+            return out, attn_weights, present_key_value
+        else:
+            return out, attn_weights
 
     def _all_observers(self):
         # local first
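QuantLlamaAttention.forward now accepts use_cache and an optional past_key_value (a (k, v) tuple or an HF Cache-like object) and, when caching is on, returns present_key_value that can be fed back on the next step. The static helper added above has a fully specified shape contract, so a small shape check is possible without constructing the module; a sketch:

```python
# Tuple KV-cache contract: keys/values are (B, n_kv, S, H) and grow along dim=2.
# With use_cache=True, forward() returns (out, attn_weights, present_key_value).
import torch

from tico.experimental.quantization.ptq.wrappers.llama.quant_attn import (
    QuantLlamaAttention,
)

B, n_kv, H = 1, 2, 4
k0, v0 = torch.randn(B, n_kv, 3, H), torch.randn(B, n_kv, 3, H)  # 3-token prefill
k1, v1 = torch.randn(B, n_kv, 1, H), torch.randn(B, n_kv, 1, H)  # next decoded token

k_total, v_total = QuantLlamaAttention._concat_kv(None, k0, v0)              # no past yet
k_total, v_total = QuantLlamaAttention._concat_kv((k_total, v_total), k1, v1)
assert k_total.shape == (B, n_kv, 4, H) and v_total.shape == (B, n_kv, 4, H)
```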
tico/experimental/quantization/ptq/wrappers/llama/quant_decoder_layer.py
CHANGED

@@ -136,7 +136,7 @@ class QuantLlamaDecoderLayer(QuantModuleBase):
         L = hidden_states.size(1)
         attention_mask = self._slice_causal(L, hidden_states.device)
 
-
+        attn_out = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             position_ids=position_ids,
@@ -147,7 +147,13 @@ class QuantLlamaDecoderLayer(QuantModuleBase):
             position_embeddings=position_embeddings,
             **kwargs,
         )
-
+        if use_cache:
+            hidden_states_attn, _attn_weights, present_key_value = attn_out
+        else:
+            hidden_states_attn, _attn_weights = attn_out
+            present_key_value = None
+
+        hidden_states = residual + hidden_states_attn
 
         # ─── MLP block ─────────────────────────────────────────────────
         residual = hidden_states
@@ -155,6 +161,12 @@ class QuantLlamaDecoderLayer(QuantModuleBase):
         hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states
 
+        # Return type policy:
+        #   - If use_cache: always return (hidden_states, present_key_value)
+        #   - Else: return as configured (tuple/tensor) for HF compatibility
+        if use_cache:
+            return hidden_states, present_key_value
+
         if self.return_type == "tuple":
             return (hidden_states,)
         elif self.return_type == "tensor":
tico/experimental/quantization/ptq/wrappers/registry.py
CHANGED

@@ -33,6 +33,7 @@ _CORE_MODULES = (
     "tico.experimental.quantization.ptq.wrappers.llama.quant_decoder_layer",
     "tico.experimental.quantization.ptq.wrappers.llama.quant_mlp",
     # fairseq
+    "tico.experimental.quantization.ptq.wrappers.fairseq.quant_encoder",
     "tico.experimental.quantization.ptq.wrappers.fairseq.quant_encoder_layer",
     "tico.experimental.quantization.ptq.wrappers.fairseq.quant_mha",
     # add future core wrappers here
tico/passes/decompose_fake_quantize_tensor_qparams.py
CHANGED

@@ -245,9 +245,10 @@ class DecomposeFakeQuantizeTensorQParams(PassBase):
             # mask_user(output).args == (dequantize_per_tensor.tensor, mask)
             if mask:
                 assert len(mask) == 1
-
-
-
+                if len(mask[0].users) > 0:
+                    mask_user = list(mask[0].users.keys())[0]
+                    assert len(mask_user.args) == 1
+                    mask_user.args = ((mask_user.args[0][0],),)
                 modified = True
             if (
                 node.target
tico/passes/remove_redundant_expand.py
CHANGED

@@ -46,7 +46,9 @@ class RemoveRedundantExpand(PassBase):
             input, size = args.input, args.size
 
             input_shape = extract_shape(input)
-
+            output_shape = extract_shape(node)
+
+            if input_shape != output_shape:
                 continue
 
             node.replace_all_uses_with(input, propagate_meta=False)
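The pass now compares the expand node's input shape with the node's own output shape and skips the rewrite whenever they differ. The plain-PyTorch behaviour the check relies on, as a small illustration (not the FX pass itself):

```python
# expand() to an identical shape is a no-op view, which is exactly the case the
# pass is allowed to drop; expand() to a larger, broadcast shape must be kept.
import torch

x = torch.randn(2, 3, 4)
same = x.expand(2, 3, 4)        # same shape -> redundant, removable
bigger = x.expand(5, 2, 3, 4)   # broadcasts -> not redundant, must be kept

assert same.shape == x.shape and torch.equal(same, x)
assert bigger.shape == (5, 2, 3, 4)
```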
{tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-tico/__init__.py,sha256=
+tico/__init__.py,sha256=Lmo72Xd9sheKIW4XhH6oc5SheplvnXak_Zbh0EQZsrI,1883
 tico/pt2_to_circle.py,sha256=gu3MD4Iqc0zMZcCZ2IT8oGbyj21CTSbT3Rgd9s2B_9A,2767
 tico/config/__init__.py,sha256=xZzCXjZ84qE-CsBi-dfaL05bqpQ3stKKfTXhnrJRyVs,142
 tico/config/base.py,sha256=q5xMqGxTUZs4mFqt5c7i_y9U00fYgdMGl9nUqIVMlCo,1248
@@ -62,7 +62,7 @@ tico/experimental/quantization/ptq/mode.py,sha256=lT-T8vIv8YWcwrjT7xXVhOw1g7aoAd
 tico/experimental/quantization/ptq/qscheme.py,sha256=uwhv7bCxOOXB3I-IKlRyr_u4eXOq48uIqGy4TLDqGxY,1301
 tico/experimental/quantization/ptq/quant_config.py,sha256=nm7570Y1X2mOT_8s27ilWid04otor6cVTi9GwgAEaKc,4300
 tico/experimental/quantization/ptq/examples/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
-tico/experimental/quantization/ptq/examples/compare_ppl.py,sha256=
+tico/experimental/quantization/ptq/examples/compare_ppl.py,sha256=QWUuO50lITnooYqEe57VV6mvIHKWZMB_TOGvtZ8C8qQ,8238
 tico/experimental/quantization/ptq/examples/debug_quant_outputs.py,sha256=astXzx-maq1W4gKvX2QaGmD2Tpmjunv4JqDYVk9eZRQ,5177
 tico/experimental/quantization/ptq/examples/quantize_linear.py,sha256=8zq-ZJDYgam0xQ-PbC6Xb1I7W1mv0Wi-b--IP2wwXtw,4539
 tico/experimental/quantization/ptq/examples/quantize_llama_attn.py,sha256=cVWUSSzaZWFp5QZkNkrlpHU3kXyP84QtnZbahVml_yQ,4329
@@ -84,13 +84,14 @@ tico/experimental/quantization/ptq/wrappers/__init__.py,sha256=IO6FP_xYbGy0dW0HL
 tico/experimental/quantization/ptq/wrappers/ptq_wrapper.py,sha256=F9sK_DiRaXiGNHULcwIbs5EUtHz6ZJ7N4r5CWTTfhsM,2442
 tico/experimental/quantization/ptq/wrappers/quant_elementwise.py,sha256=LhEoobfvto6zKrBOKL4gmxfFFc31jHzyQV_zfps-iQM,3604
 tico/experimental/quantization/ptq/wrappers/quant_module_base.py,sha256=vkcDos_knGSS29rIZuEIWkAJLHrENbGz8nCH2-iara8,5969
-tico/experimental/quantization/ptq/wrappers/registry.py,sha256=
+tico/experimental/quantization/ptq/wrappers/registry.py,sha256=GlVBPWPAnLRqTtemu_YOEX9WisF1eN6Mud7y1zzvpW0,5092
 tico/experimental/quantization/ptq/wrappers/fairseq/__init__.py,sha256=Mc8FLd9DusyB_IT1vk1OYrRkngOYnYd05IvtA9ORVQc,160
+tico/experimental/quantization/ptq/wrappers/fairseq/quant_encoder.py,sha256=r9DPUAbL2KRJ8zpMJ39Y9n6Oe79nte-mFcdjG2qEP-w,13809
 tico/experimental/quantization/ptq/wrappers/fairseq/quant_encoder_layer.py,sha256=aGr80Ku75j2H-UZ0elEa0mOQEyaAs2YJ4WJCN0lonn0,6412
 tico/experimental/quantization/ptq/wrappers/fairseq/quant_mha.py,sha256=HsigmOLeacLXc46QNeFqwQ0DwKQhNrtWTKEtLJoqXoc,15562
 tico/experimental/quantization/ptq/wrappers/llama/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
-tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py,sha256
-tico/experimental/quantization/ptq/wrappers/llama/quant_decoder_layer.py,sha256=
+tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py,sha256=futw-XhAhErdaK2cZY8T3_xCxZbsj-l1dbsSbeunE_4,10403
+tico/experimental/quantization/ptq/wrappers/llama/quant_decoder_layer.py,sha256=ZImtfT2pyYyGJa0QCcHgCVootiWeflpRvLa4LisjZSY,7646
 tico/experimental/quantization/ptq/wrappers/llama/quant_mlp.py,sha256=uZMnrX66oZwxhKhcNbLXXeri-WxxRBiZnr15aBXJMm0,3562
 tico/experimental/quantization/ptq/wrappers/nn/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/ptq/wrappers/nn/quant_layernorm.py,sha256=G5Sgt-tXnzh0Rxyk-2honmZIfEQOZlRfOsoDBdSGmA4,6887
@@ -111,7 +112,7 @@ tico/passes/convert_to_relu6.py,sha256=1BJpUwUb6Zli_1y3eyJQo7dg9B1xvZ7sYjMbvEQsF
 tico/passes/decompose_addmm.py,sha256=KjnpZjSuA0uvNmKaTN_EMwobcOi3CAB81buORzTDxro,3979
 tico/passes/decompose_batch_norm.py,sha256=06LAxhSmpTxFZJmUelwB3I_GipNWrLoM7PfM6ZkxOZY,6512
 tico/passes/decompose_fake_quantize.py,sha256=736srs8SM8K_mLR0WG10LVMMLRkYkBM9OF0k1GCkAW0,5218
-tico/passes/decompose_fake_quantize_tensor_qparams.py,sha256=
+tico/passes/decompose_fake_quantize_tensor_qparams.py,sha256=CalubQ1OYC2l59_TNPOcAnl4VxvameYWIQcy57Z6yjI,13985
 tico/passes/decompose_group_norm.py,sha256=6BqvYtMTPzeIgp8cPA8OFMwEBvb7odcg04IUgwtp7NQ,10120
 tico/passes/decompose_grouped_conv2d.py,sha256=n2qv320akL1ju33ucZ6lU1cKEAaj0NI8YZ5CrUnkRLM,8512
 tico/passes/decompose_slice_scatter.py,sha256=xqMHKhW2595YoAeubKZ4jRhYW4TQ09EXPgLNgODqXG8,5653
@@ -128,7 +129,7 @@ tico/passes/merge_consecutive_cat.py,sha256=ayZNLDA1DFM7Fxxi2Dmk1CujkgUuaVCH1rhQ
 tico/passes/ops.py,sha256=cSj3Sk2x2cOE9b8oU5pmSa_rHr-iX2lORzu3N_UHMSQ,2967
 tico/passes/remove_nop.py,sha256=Hf91p_EJAOC6DyWNthash0_UWtEcNc_M7znamQfYQ5Y,2686
 tico/passes/remove_redundant_assert_nodes.py,sha256=rYbTCyuNIXIC-2NreHKBVCuaSUkEQvB_iSRzb26P_EA,1821
-tico/passes/remove_redundant_expand.py,sha256=
+tico/passes/remove_redundant_expand.py,sha256=8yhlMnbog-T9gIK6LKIU0tu0__gfhZzO36g_fJIVVP4,2162
 tico/passes/remove_redundant_permute.py,sha256=98UsaZzFZdQzEEAR1pIzRisAf6hgfXLa88aayjalt3E,4292
 tico/passes/remove_redundant_reshape.py,sha256=aeep6LDvY58GEuOrWckkEXnJa6wkkbiJ9FrimT9F3-s,16384
 tico/passes/remove_redundant_slice.py,sha256=Iv7TbB39fktNb4eq0VdyZnwxL_VsKLJ90diMmaf3kZk,2087
@@ -251,9 +252,9 @@ tico/utils/mx/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/utils/mx/elemwise_ops.py,sha256=V6glyAHsVR1joqpsgnNytatCD_ew92xNWZ19UFDoMTA,10281
 tico/utils/mx/formats.py,sha256=uzNWyu-1onUlwQfX5cZ6fZSUfHMRqorper7_T1k3jfk,3404
 tico/utils/mx/mx_ops.py,sha256=RcfUTYVi-wilGB2sC35OeARdwDqnixv7dG5iyZ-fQT8,8555
-tico-0.1.0.
-tico-0.1.0.
-tico-0.1.0.
-tico-0.1.0.
-tico-0.1.0.
-tico-0.1.0.
+tico-0.1.0.dev250914.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
+tico-0.1.0.dev250914.dist-info/METADATA,sha256=qW47MJq3y-q2MtV7kSDUrT8dkZtBWScPMBwZgvMR6tg,8450
+tico-0.1.0.dev250914.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+tico-0.1.0.dev250914.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
+tico-0.1.0.dev250914.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
+tico-0.1.0.dev250914.dist-info/RECORD,,

{tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/LICENSE: file without changes
{tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/WHEEL: file without changes
{tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/entry_points.txt: file without changes
{tico-0.1.0.dev250911.dist-info → tico-0.1.0.dev250914.dist-info}/top_level.txt: file without changes