tico-0.1.0.dev250904-py3-none-any.whl → tico-0.1.0.dev251109-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (133)
  1. tico/__init__.py +1 -1
  2. tico/config/v1.py +5 -0
  3. tico/passes/cast_mixed_type_args.py +2 -0
  4. tico/passes/convert_expand_to_slice_cat.py +153 -0
  5. tico/passes/convert_matmul_to_linear.py +312 -0
  6. tico/passes/convert_to_relu6.py +1 -1
  7. tico/passes/decompose_fake_quantize_tensor_qparams.py +4 -3
  8. tico/passes/ops.py +0 -1
  9. tico/passes/remove_redundant_expand.py +3 -1
  10. tico/quantization/__init__.py +6 -0
  11. tico/quantization/algorithm/fpi_gptq/fpi_gptq.py +161 -0
  12. tico/quantization/algorithm/fpi_gptq/quantizer.py +179 -0
  13. tico/{experimental/quantization → quantization}/algorithm/gptq/gptq.py +24 -3
  14. tico/{experimental/quantization → quantization}/algorithm/gptq/quantizer.py +14 -6
  15. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/annotator.py +6 -8
  16. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/adaptive_avg_pool2d.py +4 -6
  17. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/add.py +4 -6
  18. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/conv2d.py +4 -6
  19. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/div.py +4 -6
  20. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/linear.py +4 -6
  21. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/mean.py +4 -6
  22. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/mul.py +4 -6
  23. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/relu6.py +4 -6
  24. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/rsqrt.py +4 -6
  25. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/sub.py +4 -6
  26. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/spec.py +1 -3
  27. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/utils.py +1 -1
  28. tico/{experimental/quantization → quantization}/algorithm/pt2e/quantizer.py +5 -2
  29. tico/{experimental/quantization → quantization}/algorithm/pt2e/utils.py +1 -3
  30. tico/{experimental/quantization → quantization}/algorithm/smoothquant/observer.py +26 -8
  31. tico/{experimental/quantization → quantization}/algorithm/smoothquant/quantizer.py +28 -9
  32. tico/quantization/algorithm/smoothquant/smooth_quant.py +327 -0
  33. tico/quantization/config/base.py +26 -0
  34. tico/quantization/config/fpi_gptq.py +29 -0
  35. tico/quantization/config/gptq.py +29 -0
  36. tico/quantization/config/pt2e.py +25 -0
  37. tico/{experimental/quantization/ptq/quant_config.py → quantization/config/ptq.py} +18 -10
  38. tico/{experimental/quantization/config.py → quantization/config/smoothquant.py} +9 -37
  39. tico/{experimental/quantization → quantization}/evaluation/evaluate.py +6 -12
  40. tico/{experimental/quantization → quantization}/evaluation/executor/circle_executor.py +3 -4
  41. tico/{experimental/quantization → quantization}/evaluation/executor/triv24_executor.py +2 -4
  42. tico/{experimental/quantization → quantization}/evaluation/utils.py +1 -1
  43. tico/{experimental/quantization → quantization}/public_interface.py +11 -18
  44. tico/{experimental/quantization → quantization}/quantizer.py +1 -1
  45. tico/quantization/quantizer_registry.py +73 -0
  46. tico/quantization/wrapq/examples/compare_ppl.py +230 -0
  47. tico/quantization/wrapq/examples/debug_quant_outputs.py +224 -0
  48. tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_linear.py +11 -10
  49. tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_llama_attn.py +10 -12
  50. tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_llama_decoder_layer.py +10 -9
  51. tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_llama_mlp.py +13 -13
  52. tico/quantization/wrapq/examples/quantize_with_gptq.py +265 -0
  53. tico/{experimental/quantization/ptq → quantization/wrapq}/observers/affine_base.py +3 -3
  54. tico/{experimental/quantization/ptq → quantization/wrapq}/observers/base.py +2 -2
  55. tico/{experimental/quantization/ptq → quantization/wrapq}/observers/ema.py +2 -2
  56. tico/{experimental/quantization/ptq → quantization/wrapq}/observers/identity.py +1 -1
  57. tico/{experimental/quantization/ptq → quantization/wrapq}/observers/minmax.py +2 -2
  58. tico/{experimental/quantization/ptq → quantization/wrapq}/observers/mx.py +1 -1
  59. tico/quantization/wrapq/quantizer.py +179 -0
  60. tico/{experimental/quantization/ptq → quantization/wrapq}/utils/introspection.py +3 -5
  61. tico/{experimental/quantization/ptq → quantization/wrapq}/utils/metrics.py +3 -2
  62. tico/quantization/wrapq/wrappers/fairseq/__init__.py +5 -0
  63. tico/quantization/wrapq/wrappers/fairseq/decoder_export_single_step.py +234 -0
  64. tico/quantization/wrapq/wrappers/fairseq/quant_decoder.py +429 -0
  65. tico/quantization/wrapq/wrappers/fairseq/quant_decoder_layer.py +492 -0
  66. tico/quantization/wrapq/wrappers/fairseq/quant_encoder.py +331 -0
  67. tico/quantization/wrapq/wrappers/fairseq/quant_encoder_layer.py +163 -0
  68. tico/quantization/wrapq/wrappers/fairseq/quant_mha.py +381 -0
  69. tico/quantization/wrapq/wrappers/llama/__init__.py +1 -0
  70. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/llama/quant_attn.py +58 -21
  71. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/llama/quant_decoder_layer.py +21 -13
  72. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/llama/quant_mlp.py +5 -7
  73. tico/quantization/wrapq/wrappers/nn/__init__.py +1 -0
  74. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/nn/quant_layernorm.py +6 -7
  75. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/nn/quant_linear.py +7 -8
  76. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/nn/quant_silu.py +8 -9
  77. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/ptq_wrapper.py +4 -6
  78. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/quant_elementwise.py +55 -17
  79. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/quant_module_base.py +10 -9
  80. tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/registry.py +17 -10
  81. tico/serialize/circle_serializer.py +11 -4
  82. tico/serialize/operators/op_constant_pad_nd.py +41 -11
  83. tico/serialize/operators/op_le.py +54 -0
  84. tico/serialize/operators/op_mm.py +15 -132
  85. tico/utils/convert.py +20 -15
  86. tico/utils/register_custom_op.py +6 -4
  87. tico/utils/signature.py +7 -8
  88. tico/utils/validate_args_kwargs.py +12 -0
  89. {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev251109.dist-info}/METADATA +48 -2
  90. {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev251109.dist-info}/RECORD +128 -108
  91. tico/experimental/quantization/__init__.py +0 -6
  92. tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py +0 -164
  93. tico/experimental/quantization/ptq/examples/compare_ppl.py +0 -121
  94. tico/experimental/quantization/ptq/examples/debug_quant_outputs.py +0 -129
  95. tico/experimental/quantization/ptq/examples/quantize_with_gptq.py +0 -165
  96. /tico/{experimental/quantization → quantization}/algorithm/__init__.py +0 -0
  97. /tico/{experimental/quantization/algorithm/gptq → quantization/algorithm/fpi_gptq}/__init__.py +0 -0
  98. /tico/{experimental/quantization/algorithm/pt2e → quantization/algorithm/gptq}/__init__.py +0 -0
  99. /tico/{experimental/quantization → quantization}/algorithm/gptq/quant.py +0 -0
  100. /tico/{experimental/quantization → quantization}/algorithm/gptq/utils.py +0 -0
  101. /tico/{experimental/quantization/algorithm/pt2e/annotation → quantization/algorithm/pt2e}/__init__.py +0 -0
  102. /tico/{experimental/quantization/algorithm/pt2e/transformation → quantization/algorithm/pt2e/annotation}/__init__.py +0 -0
  103. /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/config.py +0 -0
  104. /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/__init__.py +0 -0
  105. /tico/{experimental/quantization/algorithm/smoothquant → quantization/algorithm/pt2e/transformation}/__init__.py +0 -0
  106. /tico/{experimental/quantization → quantization}/algorithm/pt2e/transformation/convert_scalars_to_attrs.py +0 -0
  107. /tico/{experimental/quantization/evaluation → quantization/algorithm/smoothquant}/__init__.py +0 -0
  108. /tico/{experimental/quantization/evaluation/executor → quantization/config}/__init__.py +0 -0
  109. /tico/{experimental/quantization/passes → quantization/evaluation}/__init__.py +0 -0
  110. /tico/{experimental/quantization → quantization}/evaluation/backend.py +0 -0
  111. /tico/{experimental/quantization/ptq → quantization/evaluation/executor}/__init__.py +0 -0
  112. /tico/{experimental/quantization → quantization}/evaluation/executor/backend_executor.py +0 -0
  113. /tico/{experimental/quantization → quantization}/evaluation/metric.py +0 -0
  114. /tico/{experimental/quantization/ptq/examples → quantization/passes}/__init__.py +0 -0
  115. /tico/{experimental/quantization → quantization}/passes/fold_quant_ops.py +0 -0
  116. /tico/{experimental/quantization → quantization}/passes/insert_quantize_on_dtype_mismatch.py +0 -0
  117. /tico/{experimental/quantization → quantization}/passes/propagate_qparam_backward.py +0 -0
  118. /tico/{experimental/quantization → quantization}/passes/propagate_qparam_forward.py +0 -0
  119. /tico/{experimental/quantization → quantization}/passes/quantize_bias.py +0 -0
  120. /tico/{experimental/quantization → quantization}/passes/remove_weight_dequant_op.py +0 -0
  121. /tico/{experimental/quantization/ptq/observers → quantization/wrapq}/__init__.py +0 -0
  122. /tico/{experimental/quantization/ptq → quantization/wrapq}/dtypes.py +0 -0
  123. /tico/{experimental/quantization/ptq/utils → quantization/wrapq/examples}/__init__.py +0 -0
  124. /tico/{experimental/quantization/ptq → quantization/wrapq}/mode.py +0 -0
  125. /tico/{experimental/quantization/ptq/wrappers → quantization/wrapq/observers}/__init__.py +0 -0
  126. /tico/{experimental/quantization/ptq → quantization/wrapq}/qscheme.py +0 -0
  127. /tico/{experimental/quantization/ptq/wrappers/llama → quantization/wrapq/utils}/__init__.py +0 -0
  128. /tico/{experimental/quantization/ptq → quantization/wrapq}/utils/reduce_utils.py +0 -0
  129. /tico/{experimental/quantization/ptq/wrappers/nn → quantization/wrapq/wrappers}/__init__.py +0 -0
  130. {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev251109.dist-info}/LICENSE +0 -0
  131. {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev251109.dist-info}/WHEEL +0 -0
  132. {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev251109.dist-info}/entry_points.txt +0 -0
  133. {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev251109.dist-info}/top_level.txt +0 -0
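The bulk of this release is a package reorganization: tico/experimental/quantization moves to tico/quantization, and its ptq subpackage is renamed wrapq (the old ptq modules and examples are the files deleted in the hunks below). As a rough sketch of the import changes downstream code would need, here is a hypothetical before/after. The new module paths follow the renames listed above; whether the same symbols (prepare, convert, QuantConfig, PTQWrapper) are still exported from those locations is an assumption, not something this diff confirms.

# Hypothetical import migration (old paths are taken from the removed examples below).
# Before, on 0.1.0.dev250904:
#   from tico.experimental.quantization import convert, prepare
#   from tico.experimental.quantization.ptq.quant_config import QuantConfig
#   from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
# After, on 0.1.0.dev251109, assuming the public names are unchanged:
from tico.quantization import convert, prepare
from tico.quantization.config.ptq import QuantConfig
from tico.quantization.wrapq.wrappers.ptq_wrapper import PTQWrapper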
tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py (deleted)
@@ -1,164 +0,0 @@
- # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- from typing import Dict, List, Optional
-
- import torch
-
-
- @torch.no_grad()
- def smooth_weights(
-     front_module: torch.nn.Module,
-     back_modules: torch.nn.Module | List[torch.nn.Module],
-     activation_max: torch.Tensor,
-     alpha: float,
- ):
-     """
-     Applies SmoothQuant-style smoothing to the weights and biases of two
-     connected modules using activation maximum values.
-
-     NOTE All modules **MUST** have `weight` and optionally `bias` attributes.
-
-     Parameters
-     -----------
-     front_module
-         The front module whose weights and biases will be adjusted.
-     back_modules
-         A list of back modules whose weights and biases will be adjusted.
-     activation_max
-         A tensor of channel-wise maximum activation values for the front module.
-     alpha
-         The smoothing factor that determines the scaling for weight adjustments.
-
-     Raises
-     -------
-     AttributeError
-         If `front_module` or any module in `back_modules` does not have `weight` attributes.
-     ValueError
-         If the shape of tensors in `activation_max` does not match the number of channels
-         in `front_module`'s weight.
-     NotImplementedError
-         If `front_module` or any module in `back_modules` is of an unsupported type.
-     """
-     from transformers.models.llama.modeling_llama import LlamaRMSNorm
-
-     if not isinstance(back_modules, list):
-         back_modules = [back_modules]
-
-     # Check attributes
-     if not hasattr(front_module, "weight"):
-         raise AttributeError(
-             f"The front module '{type(front_module).__name__}' does not have a 'weight' attribute."
-         )
-     for back_m in back_modules:
-         if not hasattr(back_m, "weight"):
-             raise AttributeError(
-                 f"The back module '{type(back_m).__name__}' does not have a 'weight' attribute."
-             )
-     # Check shapes
-     if isinstance(front_module, LlamaRMSNorm):
-         front_numel = front_module.weight.numel()
-     else:
-         raise NotImplementedError(
-             f"Unsupported module type: {type(front_module).__name__}"
-         )
-     for back_m in back_modules:
-         if isinstance(back_m, torch.nn.Linear):
-             back_numel = back_m.in_features
-         else:
-             raise NotImplementedError(
-                 f"Unsupported module type: {type(back_m).__name__}"
-             )
-
-     if front_numel != back_numel or back_numel != activation_max.numel():
-         raise ValueError(
-             f"Shape mismatch: front_numel({front_numel}), back_numel({back_numel}), activation_max_numel({activation_max.numel()})"
-         )
-
-     # Compute scales
-     device, dtype = back_modules[0].weight.device, back_modules[0].weight.dtype
-     activation_max = activation_max.to(device=device, dtype=dtype) # type: ignore[arg-type]
-     weight_scales = torch.cat(
-         [back_m.weight.abs().max(dim=0, keepdim=True)[0] for back_m in back_modules], # type: ignore[operator]
-         dim=0,
-     )
-     weight_scales = weight_scales.max(dim=0)[0].clamp(min=1e-5)
-     scales = (
-         (activation_max.pow(alpha) / weight_scales.pow(1 - alpha))
-         .clamp(min=1e-5)
-         .to(device) # type: ignore[arg-type]
-         .to(dtype) # type: ignore[arg-type]
-     )
-
-     # Smooth
-     front_module.weight.div_(scales)
-     if hasattr(front_module, "bias"):
-         front_module.bias.div_(scales)
-
-     for back_m in back_modules:
-         back_m.weight.mul_(scales.view(1, -1)) # type: ignore[operator]
-
-
- @torch.no_grad()
- def apply_smoothing(
-     model: torch.nn.Module,
-     activation_max: Dict[str, torch.Tensor],
-     alpha: float = 0.5,
-     custom_alpha_map: Optional[Dict[str, float]] = None,
- ):
-     """
-     Applies SmoothQuant-style smoothing to the model's weights using activation maximum values.
-
-     Parameters
-     -----------
-     model
-         A torch module whose weights will be smoothed.
-     activation_max
-         The channel-wise maximum activation values for the model.
-     alpha
-         The default smoothing factor to apply across all modules.
-     custom_alpha_map
-         A dictionary mapping layer/module names to custom alpha values.
-         Layers specified in this dictionary will use the corresponding alpha
-         value instead of the default.
-     """
-     from transformers.models.llama.modeling_llama import LlamaDecoderLayer
-
-     for name, module in model.named_modules():
-         alpha_to_apply = alpha
-         if custom_alpha_map and name in custom_alpha_map:
-             alpha_to_apply = custom_alpha_map[name]
-         if alpha_to_apply > 1.0:
-             raise RuntimeError(
-                 f"Alpha value cannot exceed 1.0. Given alpha: {alpha_to_apply}"
-             )
-         # SmoothQuant is applied before capturing the graph. Therefore, it needs to know
-         # specific module information.
-         # TODO Support more modules.
-         if isinstance(module, LlamaDecoderLayer):
-             attn_ln = module.input_layernorm
-             qkv = [
-                 module.self_attn.q_proj,
-                 module.self_attn.k_proj,
-                 module.self_attn.v_proj,
-             ]
-
-             qkv_input_scales = activation_max[name + ".self_attn.q_proj"]
-             smooth_weights(attn_ln, qkv, qkv_input_scales, alpha_to_apply)
-
-             ffn_ln = module.post_attention_layernorm
-             fcs = [module.mlp.gate_proj, module.mlp.up_proj]
-             fcs_input_scales = activation_max[name + ".mlp.gate_proj"]
-
-             smooth_weights(ffn_ln, fcs, fcs_input_scales, alpha_to_apply)
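The removed smooth_quant.py above expects activation_max to be a dict of per-channel activation maxima keyed by module name (for example "<decoder layer>.self_attn.q_proj"). As a minimal sketch of how such a dict could be collected and then handed to apply_smoothing, assuming only the signatures and docstrings shown above; the hook-based collector below is illustrative and is not tico's observer code:

import torch

def collect_activation_max(model: torch.nn.Module, calib_batches) -> dict:
    """Record the per-channel max of |input| for every Linear module (sketch)."""
    act_max: dict = {}
    hooks = []

    def make_hook(name):
        def hook(module, inputs, output):
            # Flatten all leading dims, keep the channel dim, track the running max.
            x = inputs[0].detach().abs().reshape(-1, inputs[0].shape[-1]).amax(dim=0)
            act_max[name] = torch.maximum(act_max[name], x) if name in act_max else x
        return hook

    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            hooks.append(module.register_forward_hook(make_hook(name)))
    with torch.no_grad():
        for batch in calib_batches:
            model(batch)
    for h in hooks:
        h.remove()
    return act_max

# Usage (sketch): smooth a Llama model in place before graph capture.
# activation_max = collect_activation_max(model, calib_batches)
# apply_smoothing(model, activation_max, alpha=0.5)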
tico/experimental/quantization/ptq/examples/compare_ppl.py (deleted)
@@ -1,121 +0,0 @@
- # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- # =============================================================================
- # QUICK PTQ WORKFLOW (OPTIONAL FP32 BASELINE)
- # -----------------------------------------------------------------------------
- # Toggle RUN_FP to choose between:
- #   • FP32 perplexity measurement only, OR
- #   • Full post-training UINT-8 flow (wrap → calibrate → eval).
- # =============================================================================
-
- import torch
- import tqdm
- from datasets import load_dataset
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- from tico.experimental.quantization.ptq.quant_config import QuantConfig
- from tico.experimental.quantization.ptq.utils.metrics import perplexity
- from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
-
- # -------------------------------------------------------------------------
- # 0. Global configuration
- # -------------------------------------------------------------------------
- MODEL_NAME = "meta-llama/Meta-Llama-3-1B"
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
- STRIDE = 512 # sliding-window stride for perplexity
- RUN_FP = True # set False → run UINT-8 path
-
- # Token-budget presets for activation calibration
- TOKENS: dict[str, int] = {
-     # Smoke test (<1 min turnaround on CPU/GPU)
-     "debug": 2_000, # ≈16 × 128-seq batches
-     # Good default for 1-7B models (≲3 % ppl delta)
-     "baseline": 50_000,
-     # Production / 4-bit observer smoothing
-     "production": 200_000,
- }
- CALIB_TOKENS = TOKENS["baseline"]
- print(f"Calibrating with {CALIB_TOKENS:,} tokens.\n")
-
- # -------------------------------------------------------------------------
- # 1. Load model
- # -------------------------------------------------------------------------
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-
- if RUN_FP:
-     # -- FP32 baseline ------------------------------------------------------
-     print("Loading FP32 model …")
-     fp_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()
-     fp_model.config.use_cache = False
- else:
-     # -- UINT-8 pipeline -----------------------------------------------------
-     print("Creating UINT-8 clone …")
-     uint8_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()
-     uint8_model.config.use_cache = False
-
-     # ---------------------------------------------------------------------
-     # 2. Wrap every Transformer layer with PTQWrapper
-     # ---------------------------------------------------------------------
-     qcfg = QuantConfig() # all-uint8 defaults
-
-     wrapped_layers = torch.nn.ModuleList()
-     for idx, layer in enumerate(uint8_model.model.layers):
-         layer_cfg = qcfg.child(f"layer{idx}")
-         wrapped_layers.append(PTQWrapper(layer, qcfg=layer_cfg))
-     uint8_model.model.layers = wrapped_layers
-
-     # ---------------------------------------------------------------------
-     # 3. Single-pass activation calibration
-     # ---------------------------------------------------------------------
-     print("Calibrating UINT-8 observers …")
-     calib_txt = " ".join(
-         load_dataset("wikitext", "wikitext-2-raw-v1", split="train")["text"]
-     )[:CALIB_TOKENS]
-     ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(DEVICE)
-
-     # (a) switch every QuantModuleBase to CALIB mode
-     for l in uint8_model.model.layers:
-         l.enable_calibration()
-
-     # (b) run inference to collect ranges
-     with torch.no_grad():
-         for i in tqdm.trange(0, ids.size(1) - 1, STRIDE, desc="Calibration"):
-             uint8_model(ids[:, i : i + STRIDE])
-
-     # (c) freeze (scale, zero-point)
-     for l in uint8_model.model.layers:
-         l.freeze_qparams()
-
- # -------------------------------------------------------------------------
- # 4. Evaluate perplexity on Wikitext-2
- # -------------------------------------------------------------------------
- print("\nCalculating perplexities …")
- test_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
- enc = tokenizer("\n\n".join(test_ds["text"]), return_tensors="pt")
-
- if RUN_FP:
-     ppl_fp = perplexity(fp_model, enc, DEVICE, stride=STRIDE)
- else:
-     ppl_int8 = perplexity(uint8_model, enc, DEVICE, stride=STRIDE)
-
- # -------------------------------------------------------------------------
- # 5. Report
- # -------------------------------------------------------------------------
- print("\n┌── Wikitext-2 test perplexity ─────────────")
- if RUN_FP:
-     print(f"│ FP32 : {ppl_fp:8.2f}")
- else:
-     print(f"│ UINT-8 : {ppl_int8:8.2f}")
- print("└───────────────────────────────────────────")
tico/experimental/quantization/ptq/examples/debug_quant_outputs.py (deleted)
@@ -1,129 +0,0 @@
- # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import torch
- import tqdm
- from datasets import load_dataset
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- from tico.experimental.quantization.ptq.quant_config import QuantConfig
- from tico.experimental.quantization.ptq.utils.introspection import (
-     build_fqn_map,
-     compare_layer_outputs,
-     save_fp_outputs,
- )
- from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
-
- # ============================================================================
- # LAYER-WISE DIFF DEBUGGING PIPELINE
- # ----------------------------------------------------------------------------
- # A quantization debugging pipeline that identifies accuracy regressions
- # by comparing UINT vs FP outputs at each layer.
- #
- # 1. Load a full-precision (FP) LLaMA-3-1B model.
- # 2. Wrap each Transformer block with PTQWrapper (activations → fake-quant).
- # 3. Capture reference FP layer outputs before quantization.
- # 4. Calibrate UINT-8 activation observers in a single pass.
- # 5. Freeze quantization parameters (scale, zero-point).
- # 6. Re-run inference and compare UINT-8 vs FP outputs per layer.
- # 7. Report where quantization hurts the most.
- #
- # Use this pipeline to trace precision loss layer by layer, and pinpoint
- # problematic modules during post-training quantization.
- # ============================================================================
-
- # -------------------------------------------------------------------------
- # 0. Global configuration
- # -------------------------------------------------------------------------
- MODEL_NAME = "meta-llama/Meta-Llama-3-1B"
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
- STRIDE = 512
-
- # Token-budget presets for activation calibration
- TOKENS: dict[str, int] = {
-     # Smoke test (<1 min turnaround on CPU/GPU)
-     "debug": 2_000, # ≈16 × 128-seq batches
-     # Good default for 1-7B models (≲3 % ppl delta)
-     "baseline": 50_000,
-     # Production / 4-bit observer smoothing
-     "production": 200_000,
- }
- CALIB_TOKENS = TOKENS["baseline"]
- print(f"Calibrating with {CALIB_TOKENS:,} tokens.\n")
-
- # -------------------------------------------------------------------------
- # 1. Load the FP backbone
- # -------------------------------------------------------------------------
- print("Loading FP model …")
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()
- model.config.use_cache = False # disable KV-cache → full forward
- m_to_fqn = build_fqn_map(model) # map modules → fully-qualified names
-
- # Use Wikitext-2 train split for calibration.
- dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
-
- # -------------------------------------------------------------------------
- # 2. Wrap every layer with PTQWrapper (UINT-8 activations)
- # -------------------------------------------------------------------------
- print("Wrapping layers with PTQWrapper …")
- qcfg = QuantConfig() # default: per-tensor UINT8
-
- new_layers = torch.nn.ModuleList()
- for idx, fp_layer in enumerate(model.model.layers):
-     layer_cfg = qcfg.child(f"layer{idx}")
-     q_layer = PTQWrapper(
-         fp_layer,
-         qcfg=layer_cfg,
-         fp_name=m_to_fqn.get(fp_layer),
-     )
-     new_layers.append(q_layer)
-
- model.model.layers = new_layers # swap in quant wrappers
-
- # -------------------------------------------------------------------------
- # 3. Activation calibration plus FP-vs-UINT8 diffing
- # -------------------------------------------------------------------------
- print("Calibrating UINT-8 observers …")
- calib_txt = " ".join(dataset["text"])[:CALIB_TOKENS]
- ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(DEVICE)
-
- # (a) Enable CALIB mode on every QuantModuleBase
- for l in model.model.layers:
-     l.enable_calibration()
-
- # Save reference FP activations before observers clamp/quantize
- save_handles, act_cache = save_fp_outputs(model)
-
- with torch.no_grad():
-     for i in tqdm.trange(0, ids.size(1) - 1, STRIDE, desc="Act-calibration"):
-         inputs = ids[:, i : i + STRIDE]
-         model(inputs) # observers collect act. ranges
-
- # Remove save hooks now that FP activations are cached
- for h in save_handles:
-     h.remove()
-
- # (b) Freeze (scale, zero-point) after calibration
- for l in model.model.layers:
-     l.freeze_qparams()
-
- # (c) Register diff hooks and measure per-layer deltas
- cmp_handles = compare_layer_outputs(model, act_cache, metrics=["diff", "peir"])
- # Use same inputs for comparison.
- model(inputs)
-
- assert isinstance(cmp_handles, list)
- for h in cmp_handles:
-     h.remove()
tico/experimental/quantization/ptq/examples/quantize_with_gptq.py (deleted)
@@ -1,165 +0,0 @@
- # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- # =============================================================================
- # PTQ + GPTQ HYBRID QUANTIZATION PIPELINE
- # -----------------------------------------------------------------------------
- # This script shows how to:
- # 1. Load a pretrained FP Llama-3 model.
- # 2. Run GPTQ to quantize weights only.
- # 3. Wrap every Transformer layer with a PTQWrapper to quantize activations.
- # 4. Calibrate UINT-8 observers in a single pass over a text corpus.
- # 5. Inject GPTQ’s per-tensor weight scales / zero-points into the PTQ graph.
- # 6. Freeze all Q-params and compute Wikitext-2 perplexity.
- # =============================================================================
-
- from typing import Any
-
- import torch
- import tqdm
- from datasets import load_dataset
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- from tico.experimental.quantization import convert, prepare
- from tico.experimental.quantization.config import GPTQConfig
- from tico.experimental.quantization.ptq.observers.affine_base import AffineObserverBase
- from tico.experimental.quantization.ptq.quant_config import QuantConfig
- from tico.experimental.quantization.ptq.utils.introspection import build_fqn_map
- from tico.experimental.quantization.ptq.utils.metrics import perplexity
- from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
- from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
-     QuantModuleBase,
- )
-
- # -------------------------------------------------------------------------
- # 0. Global configuration
- # -------------------------------------------------------------------------
- MODEL_NAME = "meta-llama/Meta-Llama-3-1B"
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
- STRIDE = 512
-
- # Token-budget presets for activation calibration
- TOKENS: dict[str, int] = {
-     # Smoke test (<1 min turnaround on CPU/GPU)
-     "debug": 2_000, # ≈16 × 128-seq batches
-     # Good default for 1-7B models (≲3 % ppl delta)
-     "baseline": 50_000,
-     # Production / 4-bit observer smoothing
-     "production": 200_000,
- }
- CALIB_TOKENS = TOKENS["baseline"]
-
- # -------------------------------------------------------------------------
- # 1. Helper — copy GPTQ (scale, zp) into PTQ observers
- # -------------------------------------------------------------------------
- def inject_gptq_qparams(
-     root: torch.nn.Module,
-     gptq_quantizers: dict[str, Any], # {fp_name: quantizer}
-     weight_obs_name: str = "weight",
- ):
-     """
-     For every `QuantModuleBase` whose `fp_name` matches a GPTQ key,
-     locate the observer called `weight_obs_name` and overwrite its
-     (scale, zero-point), then lock them against further updates.
-     """
-     for m in root.modules():
-         if not isinstance(m, QuantModuleBase):
-             continue
-         if m.fp_name is None:
-             continue
-         quantizer = gptq_quantizers.get(m.fp_name)
-         if quantizer is None:
-             continue
-         obs = m.get_observer(weight_obs_name)
-         if obs is None:
-             continue
-         assert isinstance(obs, AffineObserverBase)
-         # GPTQ quantizer attributes
-         obs.load_qparams(quantizer.scale, quantizer.zero, lock=True)
-
-
- # -------------------------------------------------------------------------
- # 2. Load the FP backbone
- # -------------------------------------------------------------------------
- print("Loading FP model …")
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()
- model.config.use_cache = False # disable KV-cache → full forward
- m_to_fqn = build_fqn_map(model) # map modules → fully-qualified names
-
- # -------------------------------------------------------------------------
- # 3. Run GPTQ (weight-only) pass
- # -------------------------------------------------------------------------
- print("Applying GPTQ …")
- dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
- q_m = prepare(model, GPTQConfig(), inplace=True)
-
- for d in tqdm.tqdm(dataset, desc="GPTQ calibration"):
-     ids = tokenizer(d["text"], return_tensors="pt").input_ids.to(DEVICE)
-     q_m(ids) # observers gather weight stats
-
- q_m = convert(q_m, inplace=True) # materialize INT-weight tensors
-
- # -------------------------------------------------------------------------
- # 4. Wrap every layer with PTQWrapper (activation UINT-8)
- # -------------------------------------------------------------------------
- qcfg = QuantConfig() # default: per-tensor UINT8
- new_layers = torch.nn.ModuleList()
-
- for idx, fp_layer in enumerate(q_m.model.layers):
-     layer_cfg = qcfg.child(f"layer{idx}")
-     q_layer = PTQWrapper(
-         fp_layer,
-         qcfg=layer_cfg,
-         fp_name=m_to_fqn.get(fp_layer),
-     )
-     new_layers.append(q_layer)
-
- q_m.model.layers = new_layers
-
- # -------------------------------------------------------------------------
- # 5. Single-pass activation calibration
- # -------------------------------------------------------------------------
- print("Calibrating UINT-8 observers …")
- calib_txt = " ".join(
-     load_dataset("wikitext", "wikitext-2-raw-v1", split="train")["text"]
- )[:CALIB_TOKENS]
- ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(DEVICE)
-
- # (a) Enable CALIB mode on every QuantModuleBase
- for l in q_m.model.layers:
-     l.enable_calibration()
-
- # (b) Overwrite weight observers with GPTQ statistics
- inject_gptq_qparams(q_m, q_m.quantizers)
-
- with torch.no_grad():
-     for i in tqdm.trange(0, ids.size(1) - 1, STRIDE, desc="Act-calibration"):
-         q_m(ids[:, i : i + STRIDE]) # observers collect act. ranges
-
- # (c) Freeze all Q-params (scale, zp)
- for l in q_m.model.layers:
-     l.freeze_qparams()
-
- # -------------------------------------------------------------------------
- # 6. Evaluate perplexity on Wikitext-2
- # -------------------------------------------------------------------------
- print("\nCalculating perplexities …")
- test_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
- enc = tokenizer("\n\n".join(test_ds["text"]), return_tensors="pt")
- ppl_uint8 = perplexity(q_m, enc, DEVICE, stride=STRIDE)
-
- print("\n┌── Wikitext-2 test perplexity ─────────────")
- print(f"│ UINT-8 : {ppl_uint8:8.2f}")
- print("└───────────────────────────────────────────")