tico 0.1.0.dev250828__py3-none-any.whl → 0.1.0.dev250901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tico/__init__.py CHANGED
@@ -29,7 +29,7 @@ __all__ = [
29
29
  ]
30
30
 
31
31
  # THIS LINE IS AUTOMATICALLY GENERATED BY setup.py
32
- __version__ = "0.1.0.dev250828"
32
+ __version__ = "0.1.0.dev250901"
33
33
 
34
34
  MINIMUM_SUPPORTED_VERSION = "2.5.0"
35
35
  SECURE_TORCH_VERSION = "2.6.0"
@@ -0,0 +1,124 @@
1
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # =============================================================================
16
+ # POST-TRAINING QUANTIZATION EXAMPLE — Llama Decoder Layer (Self-Attn + MLP)
17
+ # -----------------------------------------------------------------------------
18
+ # This demo shows how to:
19
+ # 1. Replace a single FP32 `LlamaDecoderLayer` with `QuantLlamaDecoderLayer`.
20
+ # 2. Collect activation statistics in one calibration sweep.
21
+ # 3. Freeze scales / zero-points and switch to INT-simulation mode.
22
+ # 4. Compare INT-8 vs FP32 outputs with quick mean-absolute-diff and PEIR checks.
23
+ # 5. Export the calibrated, quantized block to a Circle model.
24
+ # -----------------------------------------------------------------------------
25
+ # Style / layout is kept identical to the `quantize_llama_attn.py` and
26
+ # `quantize_llama_mlp.py` examples for easy side-by-side reading.
27
+ # =============================================================================
28
+
29
+ import pathlib
30
+
31
+ import torch
32
+ from transformers import AutoModelForCausalLM, AutoTokenizer
33
+
34
+ from tico.experimental.quantization.evaluation.metric import compute_peir
35
+ from tico.experimental.quantization.evaluation.utils import plot_two_outputs
36
+ from tico.experimental.quantization.ptq.mode import Mode
37
+ from tico.experimental.quantization.ptq.wrappers.llama.quant_decoder_layer import (
38
+ QuantLlamaDecoderLayer,
39
+ )
40
+ from tico.utils.utils import SuppressWarning
41
+
42
+ MODEL_NAME = "Maykeye/TinyLLama-v0"
43
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
44
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
45
+
46
+ model.eval() # disable dropout, etc.
47
+ rotary = model.model.rotary_emb # RoPE helper
48
+
49
+ # -------------------------------------------------------------------------
50
+ # 1. Swap in the quant wrapper
51
+ # -------------------------------------------------------------------------
52
+ fp32_layer = model.model.layers[0] # keep a reference for diff check
53
+ model.model.layers[0] = QuantLlamaDecoderLayer(
54
+ fp32_layer
55
+ ) # PTQWrapper(fp32_layer) is also fine
56
+ model.eval()
57
+
58
+ qlayer = model.model.layers[0] # alias for brevity
59
+
60
+ # -------------------------------------------------------------------------
61
+ # 2. Single-pass calibration (gather activation ranges)
62
+ # -------------------------------------------------------------------------
63
+ PROMPTS = [
64
+ "The quick brown fox jumps over the lazy dog.",
65
+ "In 2025, AI systems accelerated hardware-software co-design at scale.",
66
+ "양자화는 왜 어려울까? 분포, 길이, 마스크가 관건이다.",
67
+ "今日はいい天気ですね。ところでRoPE角度は長さに依存します。",
68
+ "def quicksort(arr):\n if len(arr) <= 1: return arr\n ...",
69
+ "Prices rose 3.14% — see Figure 2; emails: foo@bar.com!",
70
+ ]
71
+
72
+ with torch.no_grad():
73
+ qlayer.enable_calibration()
74
+ for prompt in PROMPTS:
75
+ ids = tokenizer(prompt, return_tensors="pt")
76
+ hidden = model.model.embed_tokens(ids["input_ids"])
77
+ pos = rotary(hidden, ids["input_ids"]) # (cos, sin) tuple
78
+ S = pos[0].shape[1]
79
+ attn_mask = torch.zeros(1, 1, S, S) # causal-mask placeholder
80
+ _ = qlayer(hidden, attention_mask=attn_mask, position_embeddings=pos)
81
+ qlayer.freeze_qparams()
82
+
83
+ assert qlayer._mode is Mode.QUANT, "Quantization mode should be active now."
84
+
85
+ # -------------------------------------------------------------------------
86
+ # 3. Quick INT-sim vs FP32 sanity check
87
+ # -------------------------------------------------------------------------
88
+ ids = tokenizer("check", return_tensors="pt")
89
+ hidden = model.model.embed_tokens(ids["input_ids"])
90
+ pos = rotary(hidden, ids["input_ids"])
91
+ S = pos[0].shape[1]
92
+ attn_mask = torch.zeros(1, 1, S, S)
93
+
94
+ with torch.no_grad():
95
+ int8_out = qlayer(hidden, attention_mask=attn_mask, position_embeddings=pos)
96
+ int8 = int8_out[0] if isinstance(int8_out, tuple) else int8_out
97
+ fp32_out = fp32_layer(hidden, attention_mask=attn_mask, position_embeddings=pos)
98
+ fp32 = fp32_out[0] if isinstance(fp32_out, tuple) else fp32_out
99
+
100
+ print("┌───────────── Quantization Error Summary ─────────────")
101
+ print(f"│ Mean |diff|: {(int8 - fp32).abs().mean().item():.6f}")
102
+ print(f"│ PEIR : {compute_peir(fp32, int8) * 100:.6f} %")
103
+ print("└──────────────────────────────────────────────────────")
104
+ print(plot_two_outputs(fp32, int8))
105
+
106
+ # -------------------------------------------------------------------------
107
+ # 4. Export the calibrated layer to Circle
108
+ # -------------------------------------------------------------------------
109
+ import tico
110
+
111
+ save_path = pathlib.Path("decoder_layer.q.circle")
112
+ B, S, D = 1, 4, model.config.hidden_size
113
+ example_hidden = torch.randn(B, S, D)
114
+ example_pos = rotary(example_hidden, torch.arange(S)[None, :])
115
+ attn_mask = torch.zeros(1, 1, S, S)
116
+
117
+ with SuppressWarning(UserWarning, ".*"):
118
+ cm = tico.convert(
119
+ qlayer, (example_hidden, attn_mask), {"position_embeddings": example_pos}
120
+ )
121
+ # Note that the model is not fully quantized.
122
+ cm.save(save_path)
123
+
124
+ print(f"Quantized Circle model saved to {save_path.resolve()}")
@@ -1,6 +1,9 @@
1
1
  from tico.experimental.quantization.ptq.wrappers.llama.quant_attn import (
2
2
  QuantLlamaAttention,
3
3
  )
4
+ from tico.experimental.quantization.ptq.wrappers.llama.quant_decoder_layer import (
5
+ QuantLlamaDecoderLayer,
6
+ )
4
7
  from tico.experimental.quantization.ptq.wrappers.llama.quant_mlp import QuantLlamaMLP
5
8
 
6
- __all__ = ["QuantLlamaAttention", "QuantLlamaMLP"]
9
+ __all__ = ["QuantLlamaAttention", "QuantLlamaDecoderLayer", "QuantLlamaMLP"]
@@ -0,0 +1,168 @@
1
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional, Tuple
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from tico.experimental.quantization.ptq.quant_config import QuantConfig
21
+ from tico.experimental.quantization.ptq.wrappers.llama.quant_attn import (
22
+ QuantLlamaAttention,
23
+ )
24
+ from tico.experimental.quantization.ptq.wrappers.llama.quant_mlp import QuantLlamaMLP
25
+ from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
26
+ from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
27
+ QuantModuleBase,
28
+ )
29
+ from tico.experimental.quantization.ptq.wrappers.registry import try_register
30
+
31
+
32
+ @try_register("transformers.models.llama.modeling_llama.LlamaDecoderLayer")
33
+ class QuantLlamaDecoderLayer(QuantModuleBase):
34
+ """
35
+ Quant-aware drop-in replacement for HF `LlamaDecoderLayer`.
36
+ Signature and return-value are identical to the original.
37
+
38
+ ▸ Attention & MLP blocks are replaced by their quantized counterparts
39
+ ▸ LayerNorms remain FP32 (no fake-quant)
40
+ ▸ A "static" causal mask is pre-built in `__init__` to avoid
41
+ dynamic boolean-to-float casts inside `forward`.
42
+
43
+ Notes on the causal mask
44
+ ------------------------
45
+ Building a boolean mask "inside" `forward` would introduce
46
+ non-deterministic dynamic ops that an integer-only accelerator cannot
47
+ fuse easily. Therefore we:
48
+
49
+ 1. Pre-compute a full upper-triangular mask of size
50
+ `[1, 1, max_seq, max_seq]` in `__init__`.
51
+ 2. In `forward`, if the caller passes `attention_mask=None` or a boolean
52
+ mask, we slice the pre-computed template to the current sequence length.
53
+ """
54
+
55
+ def __init__(
56
+ self,
57
+ fp_layer: nn.Module,
58
+ *,
59
+ qcfg: Optional[QuantConfig] = None,
60
+ fp_name: Optional[str] = None,
61
+ return_type: Optional[str] = None,
62
+ ):
63
+ """
64
+ Q) Why do we need `return_type`?
65
+ A) Different versions of `transformers` wrap the decoder output in
66
+ different containers: a plain Tensor or a tuple.
67
+ """
68
+ self.return_type = return_type
69
+ if self.return_type is None:
70
+ import transformers
71
+
72
+ v = tuple(map(int, transformers.__version__.split(".")[:2]))
73
+ self.return_type = "tensor" if v >= (4, 54) else "tuple"
74
+ assert self.return_type is not None
75
+ super().__init__(qcfg, fp_name=fp_name)
76
+
77
+ # Child QuantConfigs -------------------------------------------------
78
+ attn_cfg = qcfg.child("self_attn") if qcfg else None
79
+ mlp_cfg = qcfg.child("mlp") if qcfg else None
80
+
81
+ # Quantized sub-modules ---------------------------------------------
82
+ assert hasattr(fp_layer, "self_attn") and isinstance(
83
+ fp_layer.self_attn, torch.nn.Module
84
+ )
85
+ assert hasattr(fp_layer, "mlp") and isinstance(fp_layer.mlp, torch.nn.Module)
86
+ self.self_attn = PTQWrapper(
87
+ fp_layer.self_attn, qcfg=attn_cfg, fp_name=f"{fp_name}.self_attn"
88
+ )
89
+ self.mlp = PTQWrapper(fp_layer.mlp, qcfg=mlp_cfg, fp_name=f"{fp_name}.mlp")
90
+
91
+ # LayerNorms remain FP (the fp_layer modules are reused as-is, so weights are shared)
92
+ assert hasattr(fp_layer, "input_layernorm") and isinstance(
93
+ fp_layer.input_layernorm, torch.nn.Module
94
+ )
95
+ assert hasattr(fp_layer, "post_attention_layernorm") and isinstance(
96
+ fp_layer.post_attention_layernorm, torch.nn.Module
97
+ )
98
+ self.input_layernorm = fp_layer.input_layernorm
99
+ self.post_attention_layernorm = fp_layer.post_attention_layernorm
100
+
101
+ # Static causal mask template ---------------------------------------
102
+ assert hasattr(fp_layer.self_attn, "config") and hasattr(
103
+ fp_layer.self_attn.config, "max_position_embeddings"
104
+ )
105
+ assert isinstance(fp_layer.self_attn.config.max_position_embeddings, int)
106
+ max_seq = fp_layer.self_attn.config.max_position_embeddings
107
+ mask = torch.full((1, 1, max_seq, max_seq), float("-120"))
108
+ mask.triu_(1)
109
+ self.register_buffer("causal_mask_template", mask, persistent=False)
110
+
111
+ def _slice_causal(self, seq_len: int, device: torch.device) -> torch.Tensor:
112
+ """Return `[1,1,L,L]` causal mask slice on *device*."""
113
+ assert isinstance(self.causal_mask_template, torch.Tensor)
114
+ return self.causal_mask_template[..., :seq_len, :seq_len].to(device)
115
+
116
+ def forward(
117
+ self,
118
+ hidden_states: torch.Tensor,
119
+ attention_mask: Optional[torch.Tensor] = None,
120
+ position_ids: Optional[torch.LongTensor] = None,
121
+ past_key_value: Optional["Cache"] = None, # type: ignore[name-defined]
122
+ output_attentions: Optional[bool] = False,
123
+ use_cache: Optional[bool] = False,
124
+ cache_position: Optional[torch.LongTensor] = None,
125
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
126
+ **kwargs,
127
+ ) -> Tuple[torch.Tensor] | torch.Tensor:
128
+ if output_attentions:
129
+ raise NotImplementedError(
130
+ "QuantLlamaDecoderLayer does not support output attention yet."
131
+ )
132
+ residual = hidden_states
133
+ hidden_states = self.input_layernorm(hidden_states)
134
+
135
+ if attention_mask is None or attention_mask.dtype == torch.bool:
136
+ L = hidden_states.size(1)
137
+ attention_mask = self._slice_causal(L, hidden_states.device)
138
+
139
+ hidden_states, _ = self.self_attn(
140
+ hidden_states=hidden_states,
141
+ attention_mask=attention_mask,
142
+ position_ids=position_ids,
143
+ past_key_value=past_key_value,
144
+ output_attentions=output_attentions,
145
+ use_cache=use_cache,
146
+ cache_position=cache_position,
147
+ position_embeddings=position_embeddings,
148
+ **kwargs,
149
+ )
150
+ hidden_states = residual + hidden_states
151
+
152
+ # ─── MLP block ─────────────────────────────────────────────────
153
+ residual = hidden_states
154
+ hidden_states = self.post_attention_layernorm(hidden_states)
155
+ hidden_states = self.mlp(hidden_states)
156
+ hidden_states = residual + hidden_states
157
+
158
+ if self.return_type == "tuple":
159
+ return (hidden_states,)
160
+ elif self.return_type == "tensor":
161
+ return hidden_states
162
+ else:
163
+ raise RuntimeError("Invalid return type.")
164
+
165
+ # No local observers; just recurse into children
166
+ def _all_observers(self):
167
+ yield from self.self_attn._all_observers()
168
+ yield from self.mlp._all_observers()
@@ -29,6 +29,7 @@ _CORE_MODULES = (
29
29
  "tico.experimental.quantization.ptq.wrappers.nn.quant_linear",
30
30
  "tico.experimental.quantization.ptq.wrappers.nn.quant_silu",
31
31
  "tico.experimental.quantization.ptq.wrappers.llama.quant_attn",
32
+ "tico.experimental.quantization.ptq.wrappers.llama.quant_decoder_layer",
32
33
  "tico.experimental.quantization.ptq.wrappers.llama.quant_mlp",
33
34
  # add future core wrappers here
34
35
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: tico
3
- Version: 0.1.0.dev250828
3
+ Version: 0.1.0.dev250901
4
4
  Summary: Convert exported Torch module to circle
5
5
  Home-page: UNKNOWN
6
6
  License: UNKNOWN
@@ -1,4 +1,4 @@
1
- tico/__init__.py,sha256=Ur6T0ZsgBvl70ek5stJ0_uxhK9xNmHado-clOup1OEM,1883
1
+ tico/__init__.py,sha256=MgvCVXWMpNL2dxPn54C8fdQaTJPdtHivhuNHH4qN5R8,1883
2
2
  tico/pt2_to_circle.py,sha256=gu3MD4Iqc0zMZcCZ2IT8oGbyj21CTSbT3Rgd9s2B_9A,2767
3
3
  tico/config/__init__.py,sha256=xZzCXjZ84qE-CsBi-dfaL05bqpQ3stKKfTXhnrJRyVs,142
4
4
  tico/config/base.py,sha256=q5xMqGxTUZs4mFqt5c7i_y9U00fYgdMGl9nUqIVMlCo,1248
@@ -64,6 +64,7 @@ tico/experimental/quantization/ptq/quant_config.py,sha256=nm7570Y1X2mOT_8s27ilWi
64
64
  tico/experimental/quantization/ptq/examples/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
65
65
  tico/experimental/quantization/ptq/examples/quantize_linear.py,sha256=8zq-ZJDYgam0xQ-PbC6Xb1I7W1mv0Wi-b--IP2wwXtw,4539
66
66
  tico/experimental/quantization/ptq/examples/quantize_llama_attn.py,sha256=cVWUSSzaZWFp5QZkNkrlpHU3kXyP84QtnZbahVml_yQ,4329
67
+ tico/experimental/quantization/ptq/examples/quantize_llama_decoder_layer.py,sha256=mBWrjkyEovYQsPC4Rrsri6Pm1rlFmDb3NiP0DQQhFyM,5751
67
68
  tico/experimental/quantization/ptq/examples/quantize_llama_mlp.py,sha256=N1qZQgt1S-xZrdv-PW7OfXEcv0gsO2q9faOF4aD-zKo,4147
68
69
  tico/experimental/quantization/ptq/observers/__init__.py,sha256=WF2MvL9M_jl-B1FqcY9zic34NOCRp17HkRYv-TMxMr4,613
69
70
  tico/experimental/quantization/ptq/observers/affine_base.py,sha256=e2Eba64nrxKQyE4F_WJ7WTSsk3xe6bkdGUKaoLFWGFw,4638
@@ -78,9 +79,10 @@ tico/experimental/quantization/ptq/wrappers/__init__.py,sha256=47DEQpj8HBSa-_TIm
78
79
  tico/experimental/quantization/ptq/wrappers/ptq_wrapper.py,sha256=F9sK_DiRaXiGNHULcwIbs5EUtHz6ZJ7N4r5CWTTfhsM,2442
79
80
  tico/experimental/quantization/ptq/wrappers/quant_elementwise.py,sha256=LhEoobfvto6zKrBOKL4gmxfFFc31jHzyQV_zfps-iQM,3604
80
81
  tico/experimental/quantization/ptq/wrappers/quant_module_base.py,sha256=vkcDos_knGSS29rIZuEIWkAJLHrENbGz8nCH2-iara8,5969
81
- tico/experimental/quantization/ptq/wrappers/registry.py,sha256=TwH-MD-qkTkG6M-f1VqFLmSNcXLNYsh21yjyzCcojJc,4706
82
- tico/experimental/quantization/ptq/wrappers/llama/__init__.py,sha256=b360gkQ0RxExmiV-ZaaxwJdMPJ53g6uCRlR2-_dOby0,240
82
+ tico/experimental/quantization/ptq/wrappers/registry.py,sha256=M1D_foC0PR-Ii4G0lbOO3_pmhvHlMF28NolK_q2DZtw,4783
83
+ tico/experimental/quantization/ptq/wrappers/llama/__init__.py,sha256=4xuAYnJcohMTtBzrH4cxq8WKG2GQo8nbhektVg8w7F0,380
83
84
  tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py,sha256=WIUI6EFMTvvruvqu8pBxWy6qJeDyjkaYbJk1R3pAmwE,8578
85
+ tico/experimental/quantization/ptq/wrappers/llama/quant_decoder_layer.py,sha256=2XsIf5rcabDXXkahqriSxfo2curFq0Y5bnRPcYkJPg8,7187
84
86
  tico/experimental/quantization/ptq/wrappers/llama/quant_mlp.py,sha256=uZMnrX66oZwxhKhcNbLXXeri-WxxRBiZnr15aBXJMm0,3562
85
87
  tico/experimental/quantization/ptq/wrappers/nn/__init__.py,sha256=I9uTt5HfcRoMEDYHpAeATMv2TbCQiX0ZbfUFMzSJ4Qw,336
86
88
  tico/experimental/quantization/ptq/wrappers/nn/quant_layernorm.py,sha256=G5Sgt-tXnzh0Rxyk-2honmZIfEQOZlRfOsoDBdSGmA4,6887
@@ -240,9 +242,9 @@ tico/utils/mx/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
240
242
  tico/utils/mx/elemwise_ops.py,sha256=V6glyAHsVR1joqpsgnNytatCD_ew92xNWZ19UFDoMTA,10281
241
243
  tico/utils/mx/formats.py,sha256=uzNWyu-1onUlwQfX5cZ6fZSUfHMRqorper7_T1k3jfk,3404
242
244
  tico/utils/mx/mx_ops.py,sha256=RcfUTYVi-wilGB2sC35OeARdwDqnixv7dG5iyZ-fQT8,8555
243
- tico-0.1.0.dev250828.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
244
- tico-0.1.0.dev250828.dist-info/METADATA,sha256=nYk1Pl6H1eZbVSUDMUYLDjb8-AsRW9e9v1EAnIgLM4k,8450
245
- tico-0.1.0.dev250828.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
246
- tico-0.1.0.dev250828.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
247
- tico-0.1.0.dev250828.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
248
- tico-0.1.0.dev250828.dist-info/RECORD,,
245
+ tico-0.1.0.dev250901.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
246
+ tico-0.1.0.dev250901.dist-info/METADATA,sha256=LMgoYoHYFT8cJU9VNYiiX89tMSxEX30x17x_6eWAr4o,8450
247
+ tico-0.1.0.dev250901.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
248
+ tico-0.1.0.dev250901.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
249
+ tico-0.1.0.dev250901.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
250
+ tico-0.1.0.dev250901.dist-info/RECORD,,