tico 0.1.0.dev250803__py3-none-any.whl → 0.1.0.dev251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. tico/__init__.py +1 -1
  2. tico/config/v1.py +5 -0
  3. tico/passes/cast_mixed_type_args.py +2 -0
  4. tico/passes/convert_expand_to_slice_cat.py +153 -0
  5. tico/passes/convert_matmul_to_linear.py +312 -0
  6. tico/passes/convert_to_relu6.py +1 -1
  7. tico/passes/decompose_fake_quantize_tensor_qparams.py +5 -4
  8. tico/passes/ops.py +0 -1
  9. tico/passes/remove_redundant_assert_nodes.py +3 -1
  10. tico/passes/remove_redundant_expand.py +3 -1
  11. tico/quantization/__init__.py +6 -0
  12. tico/{experimental/quantization → quantization}/algorithm/gptq/gptq.py +1 -1
  13. tico/{experimental/quantization → quantization}/algorithm/gptq/quantizer.py +30 -8
  14. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/annotator.py +6 -8
  15. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/adaptive_avg_pool2d.py +4 -6
  16. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/add.py +4 -6
  17. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/conv2d.py +4 -6
  18. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/div.py +4 -6
  19. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/linear.py +4 -6
  20. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/mean.py +4 -6
  21. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/mul.py +4 -6
  22. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/relu6.py +4 -6
  23. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/rsqrt.py +4 -6
  24. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/sub.py +4 -6
  25. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/spec.py +1 -3
  26. tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/utils.py +1 -1
  27. tico/{experimental/quantization → quantization}/algorithm/pt2e/quantizer.py +5 -2
  28. tico/{experimental/quantization → quantization}/algorithm/pt2e/utils.py +1 -3
  29. tico/{experimental/quantization → quantization}/algorithm/smoothquant/observer.py +26 -8
  30. tico/{experimental/quantization → quantization}/algorithm/smoothquant/quantizer.py +28 -9
  31. tico/quantization/algorithm/smoothquant/smooth_quant.py +327 -0
  32. tico/quantization/config/base.py +26 -0
  33. tico/quantization/config/gptq.py +29 -0
  34. tico/quantization/config/pt2e.py +25 -0
  35. tico/quantization/config/ptq.py +119 -0
  36. tico/{experimental/quantization/config.py → quantization/config/smoothquant.py} +9 -36
  37. tico/{experimental/quantization → quantization}/evaluation/evaluate.py +7 -16
  38. tico/{experimental/quantization → quantization}/evaluation/executor/circle_executor.py +3 -4
  39. tico/{experimental/quantization → quantization}/evaluation/executor/triv24_executor.py +2 -4
  40. tico/quantization/evaluation/metric.py +146 -0
  41. tico/{experimental/quantization → quantization}/evaluation/utils.py +1 -1
  42. tico/quantization/passes/__init__.py +1 -0
  43. tico/{experimental/quantization → quantization}/public_interface.py +11 -18
  44. tico/{experimental/quantization → quantization}/quantizer.py +1 -1
  45. tico/quantization/quantizer_registry.py +73 -0
  46. tico/quantization/wrapq/__init__.py +1 -0
  47. tico/quantization/wrapq/dtypes.py +70 -0
  48. tico/quantization/wrapq/examples/__init__.py +1 -0
  49. tico/quantization/wrapq/examples/compare_ppl.py +230 -0
  50. tico/quantization/wrapq/examples/debug_quant_outputs.py +224 -0
  51. tico/quantization/wrapq/examples/quantize_linear.py +107 -0
  52. tico/quantization/wrapq/examples/quantize_llama_attn.py +101 -0
  53. tico/quantization/wrapq/examples/quantize_llama_decoder_layer.py +125 -0
  54. tico/quantization/wrapq/examples/quantize_llama_mlp.py +95 -0
  55. tico/quantization/wrapq/examples/quantize_with_gptq.py +265 -0
  56. tico/quantization/wrapq/mode.py +32 -0
  57. tico/quantization/wrapq/observers/__init__.py +1 -0
  58. tico/quantization/wrapq/observers/affine_base.py +128 -0
  59. tico/quantization/wrapq/observers/base.py +98 -0
  60. tico/quantization/wrapq/observers/ema.py +62 -0
  61. tico/quantization/wrapq/observers/identity.py +74 -0
  62. tico/quantization/wrapq/observers/minmax.py +39 -0
  63. tico/quantization/wrapq/observers/mx.py +60 -0
  64. tico/quantization/wrapq/qscheme.py +40 -0
  65. tico/quantization/wrapq/quantizer.py +179 -0
  66. tico/quantization/wrapq/utils/__init__.py +1 -0
  67. tico/quantization/wrapq/utils/introspection.py +167 -0
  68. tico/quantization/wrapq/utils/metrics.py +124 -0
  69. tico/quantization/wrapq/utils/reduce_utils.py +25 -0
  70. tico/quantization/wrapq/wrappers/__init__.py +1 -0
  71. tico/quantization/wrapq/wrappers/fairseq/__init__.py +5 -0
  72. tico/quantization/wrapq/wrappers/fairseq/decoder_export_single_step.py +234 -0
  73. tico/quantization/wrapq/wrappers/fairseq/quant_decoder.py +429 -0
  74. tico/quantization/wrapq/wrappers/fairseq/quant_decoder_layer.py +492 -0
  75. tico/quantization/wrapq/wrappers/fairseq/quant_encoder.py +331 -0
  76. tico/quantization/wrapq/wrappers/fairseq/quant_encoder_layer.py +163 -0
  77. tico/quantization/wrapq/wrappers/fairseq/quant_mha.py +381 -0
  78. tico/quantization/wrapq/wrappers/llama/__init__.py +1 -0
  79. tico/quantization/wrapq/wrappers/llama/quant_attn.py +276 -0
  80. tico/quantization/wrapq/wrappers/llama/quant_decoder_layer.py +176 -0
  81. tico/quantization/wrapq/wrappers/llama/quant_mlp.py +96 -0
  82. tico/quantization/wrapq/wrappers/nn/__init__.py +1 -0
  83. tico/quantization/wrapq/wrappers/nn/quant_layernorm.py +183 -0
  84. tico/quantization/wrapq/wrappers/nn/quant_linear.py +65 -0
  85. tico/quantization/wrapq/wrappers/nn/quant_silu.py +59 -0
  86. tico/quantization/wrapq/wrappers/ptq_wrapper.py +69 -0
  87. tico/quantization/wrapq/wrappers/quant_elementwise.py +111 -0
  88. tico/quantization/wrapq/wrappers/quant_module_base.py +168 -0
  89. tico/quantization/wrapq/wrappers/registry.py +125 -0
  90. tico/serialize/circle_serializer.py +11 -4
  91. tico/serialize/operators/adapters/__init__.py +1 -0
  92. tico/serialize/operators/adapters/llama_rmsnorm.py +35 -0
  93. tico/serialize/operators/op_constant_pad_nd.py +41 -11
  94. tico/serialize/operators/op_le.py +54 -0
  95. tico/serialize/operators/op_mm.py +15 -132
  96. tico/serialize/operators/op_rmsnorm.py +65 -0
  97. tico/utils/convert.py +20 -15
  98. tico/utils/dtype.py +22 -0
  99. tico/utils/register_custom_op.py +29 -4
  100. tico/utils/signature.py +247 -0
  101. tico/utils/utils.py +50 -53
  102. tico/utils/validate_args_kwargs.py +37 -0
  103. {tico-0.1.0.dev250803.dist-info → tico-0.1.0.dev251102.dist-info}/METADATA +49 -2
  104. {tico-0.1.0.dev250803.dist-info → tico-0.1.0.dev251102.dist-info}/RECORD +130 -73
  105. tico/experimental/quantization/__init__.py +0 -6
  106. tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py +0 -164
  107. tico/experimental/quantization/evaluation/metric.py +0 -109
  108. /tico/{experimental/quantization → quantization}/algorithm/__init__.py +0 -0
  109. /tico/{experimental/quantization → quantization}/algorithm/gptq/__init__.py +0 -0
  110. /tico/{experimental/quantization → quantization}/algorithm/gptq/quant.py +0 -0
  111. /tico/{experimental/quantization → quantization}/algorithm/gptq/utils.py +0 -0
  112. /tico/{experimental/quantization → quantization}/algorithm/pt2e/__init__.py +0 -0
  113. /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/__init__.py +0 -0
  114. /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/config.py +0 -0
  115. /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/__init__.py +0 -0
  116. /tico/{experimental/quantization → quantization}/algorithm/pt2e/transformation/__init__.py +0 -0
  117. /tico/{experimental/quantization → quantization}/algorithm/pt2e/transformation/convert_scalars_to_attrs.py +0 -0
  118. /tico/{experimental/quantization → quantization}/algorithm/smoothquant/__init__.py +0 -0
  119. /tico/{experimental/quantization/evaluation → quantization/config}/__init__.py +0 -0
  120. /tico/{experimental/quantization/evaluation/executor → quantization/evaluation}/__init__.py +0 -0
  121. /tico/{experimental/quantization → quantization}/evaluation/backend.py +0 -0
  122. /tico/{experimental/quantization/passes → quantization/evaluation/executor}/__init__.py +0 -0
  123. /tico/{experimental/quantization → quantization}/evaluation/executor/backend_executor.py +0 -0
  124. /tico/{experimental/quantization → quantization}/passes/fold_quant_ops.py +0 -0
  125. /tico/{experimental/quantization → quantization}/passes/insert_quantize_on_dtype_mismatch.py +0 -0
  126. /tico/{experimental/quantization → quantization}/passes/propagate_qparam_backward.py +0 -0
  127. /tico/{experimental/quantization → quantization}/passes/propagate_qparam_forward.py +0 -0
  128. /tico/{experimental/quantization → quantization}/passes/quantize_bias.py +0 -0
  129. /tico/{experimental/quantization → quantization}/passes/remove_weight_dequant_op.py +0 -0
  130. {tico-0.1.0.dev250803.dist-info → tico-0.1.0.dev251102.dist-info}/LICENSE +0 -0
  131. {tico-0.1.0.dev250803.dist-info → tico-0.1.0.dev251102.dist-info}/WHEEL +0 -0
  132. {tico-0.1.0.dev250803.dist-info → tico-0.1.0.dev251102.dist-info}/entry_points.txt +0 -0
  133. {tico-0.1.0.dev250803.dist-info → tico-0.1.0.dev251102.dist-info}/top_level.txt +0 -0
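
The headline change in this release is that the quantization stack moved out of tico.experimental.quantization into tico.quantization, together with the new wrapq PTQ wrappers, per-algorithm config modules, and the example scripts listed above. Before the full diffs below, here is a minimal sketch of the prepare → calibrate → convert → export flow, distilled from those example files; the toy nn.Linear module and the output file name linear.q.circle are illustrative only and not part of the package:

    import torch
    import torch.nn as nn

    import tico
    from tico.quantization import convert, prepare
    from tico.quantization.config.ptq import PTQConfig

    # Toy FP32 module used only for illustration; the examples below apply the
    # same flow to nn.Linear, Llama attention/MLP blocks, and decoder layers.
    layer = nn.Linear(16, 8, bias=False).eval()

    # 1. Wrap the module with PTQ observers (default PTQConfig: per-tensor UINT8).
    qlayer = prepare(layer, PTQConfig())

    # 2. Calibrate: run representative inputs so observers record activation ranges.
    with torch.no_grad():
        for _ in range(16):
            _ = qlayer(torch.randn(4, 16))

    # 3. Freeze scales / zero-points and switch the wrapper to INT simulation.
    convert(qlayer)

    # 4. Export the calibrated module to a Circle file.
    cm = tico.convert(qlayer, (torch.randn(1, 16),))
    cm.save("linear.q.circle")

The examples below expand this pattern to Llama attention, MLP, and decoder-layer blocks, and debug_quant_outputs.py adds per-layer FP-vs-UINT8 diffing on top of it.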

tico/quantization/wrapq/examples/debug_quant_outputs.py
@@ -0,0 +1,224 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # ============================================================================
+ # LAYER-WISE DIFF DEBUGGING PIPELINE
+ # ----------------------------------------------------------------------------
+ # A quantization debugging pipeline that identifies accuracy regressions
+ # by comparing UINT-8 vs FP outputs at each layer.
+ #
+ # 1. Load a full-precision (FP) LLaMA-3-1B model.
+ # 2. Wrap each Transformer block with PTQWrapper (activations → fake-quant).
+ # 3. Capture reference FP layer outputs before quantization.
+ # 4. Calibrate UINT-8 activation observers in a single pass.
+ # 5. Freeze quantization parameters (scale, zero-point).
+ # 6. Re-run inference and compare UINT-8 vs FP outputs per layer.
+ # 7. Report where quantization hurts the most.
+ #
+ # Use this pipeline to trace precision loss layer by layer and pinpoint
+ # problematic modules during post-training quantization.
+ # ============================================================================
+
+ import argparse
+ import sys
+
+ import torch
+ import tqdm
+ from datasets import load_dataset
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from tico.quantization import convert, prepare
+ from tico.quantization.config.ptq import PTQConfig
+ from tico.quantization.wrapq.utils.introspection import (
+     build_fqn_map,
+     compare_layer_outputs,
+     save_fp_outputs,
+ )
+ from tico.quantization.wrapq.wrappers.ptq_wrapper import PTQWrapper
+
+ # Token-budget presets for activation calibration
+ TOKENS: dict[str, int] = {
+     # Smoke test (<1 min turnaround on CPU/GPU)
+     "debug": 2_000,  # ≈16 × 128-seq batches
+     # Good default for 1-7B models (≲3 % ppl delta)
+     "baseline": 50_000,
+     # Production / 4-bit observer smoothing
+     "production": 200_000,
+ }
+
+ DTYPE_MAP = {
+     "float32": torch.float32,
+     "bfloat16": torch.bfloat16,
+     "float16": torch.float16,
+ }
+
+ # Hardcoded dataset settings
+ DATASET_NAME = "wikitext"
+ DATASET_CONFIG = "wikitext-2-raw-v1"
+ TRAIN_SPLIT = "train"
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Layer-wise diff debugging pipeline for PTQ"
+     )
+     parser.add_argument(
+         "--model", type=str, required=True, help="HF repo name or local path."
+     )
+     parser.add_argument(
+         "--device",
+         type=str,
+         default="cuda" if torch.cuda.is_available() else "cpu",
+         help="Device to run on (cuda|cpu|mps).",
+     )
+     parser.add_argument(
+         "--dtype",
+         choices=list(DTYPE_MAP.keys()),
+         default="float32",
+         help="Model dtype used when loading.",
+     )
+     parser.add_argument(
+         "--stride",
+         type=int,
+         default=512,
+         help="Sliding-window stride used during calibration.",
+     )
+     parser.add_argument(
+         "--calib-preset",
+         choices=list(TOKENS.keys()),
+         default="debug",
+         help="Calibration token budget preset.",
+     )
+     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
+     parser.add_argument(
+         "--trust-remote-code",
+         action="store_true",
+         help="Enable only if you trust the model repo code.",
+     )
+     parser.add_argument(
+         "--hf-token",
+         type=str,
+         default=None,
+         help="Optional HF token for gated/private repos.",
+     )
+     parser.add_argument(
+         "--use-cache",
+         dest="use_cache",
+         action="store_true",
+         default=False,
+         help="Use model KV cache if enabled (off by default).",
+     )
+     parser.add_argument(
+         "--no-tqdm", action="store_true", help="Disable tqdm progress bars."
+     )
+
+     args = parser.parse_args()
+
+     # Basic setup
+     torch.manual_seed(args.seed)
+     device = torch.device(args.device)
+     dtype = DTYPE_MAP[args.dtype]
+
+     print("=== Config ===")
+     print(f"Model         : {args.model}")
+     print(f"Device        : {device.type}")
+     print(f"DType         : {args.dtype}")
+     print(f"Stride        : {args.stride}")
+     print(
+         f"Calib preset  : {args.calib_preset} ({TOKENS[args.calib_preset]:,} tokens)"
+     )
+     print(f"Use KV cache? : {args.use_cache}")
+     print()
+
+     # -------------------------------------------------------------------------
+     # 1. Load the FP backbone and tokenizer
+     # -------------------------------------------------------------------------
+     print("Loading FP model …")
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.model,
+         trust_remote_code=args.trust_remote_code,
+         token=args.hf_token,
+     )
+     model = (
+         AutoModelForCausalLM.from_pretrained(
+             args.model,
+             torch_dtype=dtype,
+             trust_remote_code=args.trust_remote_code,
+             token=args.hf_token,
+         )
+         .to(device)
+         .eval()
+     )
+
+     # Disable KV cache to force full forward passes for introspection
+     model.config.use_cache = args.use_cache
+
+     # Build module -> FQN map before wrapping
+     m_to_fqn = build_fqn_map(model)
+
+     # Prepare calibration inputs (HF Wikitext-2 train split)
+     CALIB_TOKENS = TOKENS[args.calib_preset]
+     print(f"Calibrating with {CALIB_TOKENS:,} tokens.\n")
+     # Use Wikitext-2 train split for calibration.
+     dataset = load_dataset(DATASET_NAME, DATASET_CONFIG, split=TRAIN_SPLIT)
+
+     # -------------------------------------------------------------------------
+     # 2. Wrap every layer with PTQWrapper (UINT-8 activations)
+     # -------------------------------------------------------------------------
+     print("Wrapping layers with PTQWrapper …")
+     qcfg = PTQConfig()  # default: per-tensor UINT8
+     prepare(model, qcfg)
+
+     # -------------------------------------------------------------------------
+     # 3. Activation calibration plus FP-vs-UINT8 diffing
+     # -------------------------------------------------------------------------
+     print("Calibrating UINT-8 observers …")
+     calib_txt = " ".join(dataset["text"])[:CALIB_TOKENS]
+     ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(device)
+
+     # Save reference FP activations before observers clamp/quantize
+     save_handles, act_cache = save_fp_outputs(model)
+
+     iterator = range(0, ids.size(1) - 1, args.stride)
+     if not args.no_tqdm:
+         iterator = tqdm.tqdm(iterator, desc="Act-Calibration")
+     with torch.no_grad():
+         for i in iterator:
+             inputs = ids[:, i : i + args.stride]
+             model(inputs)  # observers collect act. ranges
+
+     # Remove save hooks now that FP activations are cached
+     for h in save_handles:
+         h.remove()
+
+     # Freeze (scale, zero-point) after calibration
+     convert(model)
+
+     # Register diff hooks and measure per-layer deltas
+     cmp_handles = compare_layer_outputs(model, act_cache, metrics=["diff", "peir"])
+     # Use the same inputs (last calibration chunk) for the comparison pass.
+     with torch.no_grad():
+         model(inputs)
+
+     assert isinstance(cmp_handles, list)
+     for h in cmp_handles:
+         h.remove()
+
+
+ if __name__ == "__main__":
+     try:
+         main()
+     except Exception as e:
+         print(f"\n[Error] {e}", file=sys.stderr)
+         sys.exit(1)

tico/quantization/wrapq/examples/quantize_linear.py
@@ -0,0 +1,107 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # =============================================================================
+ # POST-TRAINING QUANTIZATION EXAMPLE — Simple Linear Model
+ # -----------------------------------------------------------------------------
+ # This demo shows a minimal PTQ flow for a toy model:
+ # 1. Define a simple model with a single Linear layer.
+ # 2. Replace the FP32 Linear with a QuantLinear wrapper.
+ # 3. Run a short calibration pass to collect activation statistics.
+ # 4. Freeze scales / zero-points and switch to INT-simulation mode.
+ # 5. Compare INT vs FP32 outputs with a mean-absolute-diff check.
+ # 6. Export the quantized model to a Circle format.
+ # =============================================================================
+
+ import pathlib
+
+ import torch
+ import torch.nn as nn
+
+ from tico.quantization import convert, prepare
+ from tico.quantization.config.ptq import PTQConfig
+ from tico.quantization.evaluation.metric import compute_peir
+ from tico.quantization.evaluation.utils import plot_two_outputs
+ from tico.quantization.wrapq.mode import Mode
+ from tico.quantization.wrapq.wrappers.nn.quant_linear import QuantLinear
+ from tico.utils.utils import SuppressWarning
+
+
+ # -------------------------------------------------------------------------
+ # 0. Define a toy model (1 Linear layer only)
+ # -------------------------------------------------------------------------
+ class TinyLinearModel(nn.Module):
+     """A minimal model: single Linear layer."""
+
+     def __init__(self, in_features=16, out_features=8):
+         super().__init__()
+         self.fc = nn.Linear(in_features, out_features, bias=False)
+
+     def forward(self, x):
+         return self.fc(x)
+
+
+ # Instantiate FP32 model
+ model = TinyLinearModel()
+ model.eval()
+
+ # Keep FP32 reference for diff check
+ fp32_layer = model.fc
+
+ # -------------------------------------------------------------------------
+ # 1. Replace the Linear with QuantLinear wrapper
+ # -------------------------------------------------------------------------
+ model.fc = prepare(fp32_layer, PTQConfig())  # type: ignore[assignment]
+ qlayer = model.fc  # alias for brevity
+
+ # -------------------------------------------------------------------------
+ # 2. Single-pass calibration (collect activation ranges)
+ # -------------------------------------------------------------------------
+ assert isinstance(qlayer.wrapped, QuantLinear)
+ with torch.no_grad():
+     for _ in range(16):  # small toy batch
+         x = torch.randn(4, 16)  # (batch=4, features=16)
+         _ = model(x)
+
+ convert(qlayer)
+
+ assert qlayer._mode is Mode.QUANT, "Quantization mode should be active now."
+
+ # -------------------------------------------------------------------------
+ # 3. Quick INT-sim vs FP32 sanity check
+ # -------------------------------------------------------------------------
+ x = torch.randn(2, 16)
+ with torch.no_grad():
+     int8_out = model(x)
+     fp32_out = fp32_layer(x)
+
+ print("┌───────────── Quantization Error Summary ─────────────")
+ print(f"│ Mean |diff|: {(int8_out - fp32_out).abs().mean().item():.6f}")
+ print(f"│ PEIR       : {compute_peir(fp32_out, int8_out) * 100:.6f} %")
+ print("└──────────────────────────────────────────────────────")
+ print(plot_two_outputs(fp32_out, int8_out))
+
+ # -------------------------------------------------------------------------
+ # 4. Export the calibrated model to Circle
+ # -------------------------------------------------------------------------
+ import tico
+
+ save_path = pathlib.Path("tiny_linear.q.circle")
+ example_input = torch.randn(1, 16)
+
+ with SuppressWarning(UserWarning, ".*"):
+     cm = tico.convert(model, (example_input,))  # forward(x) only
+     cm.save(save_path)
+
+ print(f"Quantized Circle model saved to {save_path.resolve()}")

tico/quantization/wrapq/examples/quantize_llama_attn.py
@@ -0,0 +1,101 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import pathlib
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from tico.quantization import convert, prepare
+ from tico.quantization.config.ptq import PTQConfig
+ from tico.quantization.evaluation.metric import compute_peir
+ from tico.quantization.evaluation.utils import plot_two_outputs
+ from tico.quantization.wrapq.mode import Mode
+ from tico.quantization.wrapq.wrappers.llama.quant_attn import QuantLlamaAttention
+ from tico.utils.utils import SuppressWarning
+
+ name = "Maykeye/TinyLLama-v0"
+ model = AutoModelForCausalLM.from_pretrained(name)
+ tokenizer = AutoTokenizer.from_pretrained(name)
+
+ # -------------------------------------------------------------------------
+ # 1. Replace layer-0’s self-attention with QuantLlamaAttention
+ # -------------------------------------------------------------------------
+ orig_attn = model.model.layers[0].self_attn
+ model.model.layers[0].self_attn = prepare(orig_attn, PTQConfig())
+ model.eval()
+
+ attn_q = model.model.layers[0].self_attn  # quant wrapper
+ assert isinstance(attn_q.wrapped, QuantLlamaAttention)
+ rotary = model.model.rotary_emb
+
+ # -------------------------------------------------------------------------
+ # 2. Single-pass calibration
+ # -------------------------------------------------------------------------
+ PROMPTS = [
+     "The quick brown fox jumps over the lazy dog.",
+     "In 2025, AI systems accelerated hardware-software co-design at scale.",
+     "양자화는 왜 어려울까? 분포, 길이, 마스크가 관건이다.",
+     "今日はいい天気ですね。ところでRoPE角度は長さに依存します。",
+     "def quicksort(arr):\n if len(arr) <= 1: return arr\n ...",
+     "Prices rose 3.14% — see Figure 2; emails: foo@bar.com!",
+ ]
+
+ with torch.no_grad():
+     for prompt in PROMPTS:
+         ids = tokenizer(prompt, return_tensors="pt")
+         embeds = model.model.embed_tokens(ids["input_ids"])
+         cos_sin = rotary(embeds, ids["input_ids"])
+         S = cos_sin[0].shape[1]
+         float_mask = torch.zeros(1, 1, S, S)
+         _ = attn_q(embeds, cos_sin)  # observers collect
+
+ convert(attn_q)
+
+ assert attn_q._mode is Mode.QUANT, "Quantization mode should be active now."
+
+ # -------------------------------------------------------------------------
+ # 3. Quick diff check (INT-sim vs FP32)
+ # -------------------------------------------------------------------------
+ ids = tokenizer("check", return_tensors="pt")
+ emb = model.model.embed_tokens(ids["input_ids"])
+ pos = rotary(emb, ids["input_ids"])
+ S = pos[0].shape[1]
+ float_mask = torch.zeros(1, 1, S, S)
+ with torch.no_grad():
+     int8 = attn_q(emb, pos)[0]
+     fp32 = orig_attn(emb, position_embeddings=pos, attention_mask=None)[0]
+
+ print("┌───────────── Quantization Error Summary ─────────────")
+ print(f"│ Mean |diff|: {(int8 - fp32).abs().mean().item():.6f}")
+ print(f"│ PEIR       : {compute_peir(fp32, int8) * 100:.6f} %")
+ print("└──────────────────────────────────────────────────────")
+ print(plot_two_outputs(fp32, int8))
+
+ # -------------------------------------------------------------------------
+ # 4. Export the quantized block
+ # -------------------------------------------------------------------------
+ import tico
+
+ save_path = pathlib.Path("attn.q.circle")
+ B, S, D = 1, 4, model.config.hidden_size
+ example = torch.randn(B, S, D)
+ example_pos = rotary(example, torch.arange(S)[None, :])
+ float_mask = torch.zeros(1, 1, S, S)
+
+ with SuppressWarning(UserWarning, ".*"):
+     cm = tico.convert(attn_q, (example, example_pos))
+     cm.save(save_path)
+
+ print(f"Quantized Circle model saved to {save_path.resolve()}")

tico/quantization/wrapq/examples/quantize_llama_decoder_layer.py
@@ -0,0 +1,125 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # =============================================================================
+ # POST-TRAINING QUANTIZATION EXAMPLE — Llama Decoder Layer (Self-Attn + MLP)
+ # -----------------------------------------------------------------------------
+ # This demo shows how to:
+ # 1. Replace a single FP32 `LlamaDecoderLayer` with `QuantLlamaDecoderLayer`.
+ # 2. Collect activation statistics in one calibration sweep.
+ # 3. Freeze scales / zero-points and switch to INT-simulation mode.
+ # 4. Compare INT-8 vs FP32 outputs with a quick mean-absolute-diff check.
+ # 5. Export the calibrated, quantized block to a Circle model.
+ # -----------------------------------------------------------------------------
+ # Style / layout is kept identical to the `quantize_llama_attn.py` and
+ # `quantize_llama_mlp.py` examples for easy side-by-side reading.
+ # =============================================================================
+
+ import pathlib
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from tico.quantization import convert, prepare
+ from tico.quantization.config.ptq import PTQConfig
+ from tico.quantization.evaluation.metric import compute_peir
+ from tico.quantization.evaluation.utils import plot_two_outputs
+ from tico.quantization.wrapq.mode import Mode
+ from tico.quantization.wrapq.wrappers.llama.quant_decoder_layer import (
+     QuantLlamaDecoderLayer,
+ )
+ from tico.utils.utils import SuppressWarning
+
+ MODEL_NAME = "Maykeye/TinyLLama-v0"
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+ model.eval()  # disable dropout, etc.
+ rotary = model.model.rotary_emb  # RoPE helper
+
+ # -------------------------------------------------------------------------
+ # 1. Swap in the quant wrapper
+ # -------------------------------------------------------------------------
+ fp32_layer = model.model.layers[0]  # keep a reference for diff check
+ model.model.layers[0] = prepare(fp32_layer, PTQConfig())
+ model.eval()
+
+ qlayer = model.model.layers[0]  # alias for brevity
+ assert isinstance(qlayer.wrapped, QuantLlamaDecoderLayer)
+
+ # -------------------------------------------------------------------------
+ # 2. Single-pass calibration (gather activation ranges)
+ # -------------------------------------------------------------------------
+ PROMPTS = [
+     "The quick brown fox jumps over the lazy dog.",
+     "In 2025, AI systems accelerated hardware-software co-design at scale.",
+     "양자화는 왜 어려울까? 분포, 길이, 마스크가 관건이다.",
+     "今日はいい天気ですね。ところでRoPE角度は長さに依存します。",
+     "def quicksort(arr):\n if len(arr) <= 1: return arr\n ...",
+     "Prices rose 3.14% — see Figure 2; emails: foo@bar.com!",
+ ]
+
+ with torch.no_grad():
+     for prompt in PROMPTS:
+         ids = tokenizer(prompt, return_tensors="pt")
+         hidden = model.model.embed_tokens(ids["input_ids"])
+         pos = rotary(hidden, ids["input_ids"])  # (cos, sin) tuple
+         S = pos[0].shape[1]
+         attn_mask = torch.zeros(1, 1, S, S)  # causal-mask placeholder
+         _ = qlayer(hidden, attention_mask=attn_mask, position_embeddings=pos)
+
+ convert(qlayer)
+
+ assert qlayer._mode is Mode.QUANT, "Quantization mode should be active now."
+
+ # -------------------------------------------------------------------------
+ # 3. Quick INT-sim vs FP32 sanity check
+ # -------------------------------------------------------------------------
+ ids = tokenizer("check", return_tensors="pt")
+ hidden = model.model.embed_tokens(ids["input_ids"])
+ pos = rotary(hidden, ids["input_ids"])
+ S = pos[0].shape[1]
+ attn_mask = torch.zeros(1, 1, S, S)
+
+ with torch.no_grad():
+     int8_out = qlayer(hidden, attention_mask=attn_mask, position_embeddings=pos)
+     int8 = int8_out[0] if isinstance(int8_out, tuple) else int8_out
+     fp32_out = fp32_layer(hidden, attention_mask=attn_mask, position_embeddings=pos)
+     fp32 = fp32_out[0] if isinstance(fp32_out, tuple) else fp32_out
+
+ print("┌───────────── Quantization Error Summary ─────────────")
+ print(f"│ Mean |diff|: {(int8 - fp32).abs().mean().item():.6f}")
+ print(f"│ PEIR       : {compute_peir(fp32, int8) * 100:.6f} %")
+ print("└──────────────────────────────────────────────────────")
+ print(plot_two_outputs(fp32, int8))
+
+ # -------------------------------------------------------------------------
+ # 4. Export the calibrated layer to Circle
+ # -------------------------------------------------------------------------
+ import tico
+
+ save_path = pathlib.Path("decoder_layer.q.circle")
+ B, S, D = 1, 4, model.config.hidden_size
+ example_hidden = torch.randn(B, S, D)
+ example_pos = rotary(example_hidden, torch.arange(S)[None, :])
+ attn_mask = torch.zeros(1, 1, S, S)
+
+ with SuppressWarning(UserWarning, ".*"):
+     cm = tico.convert(
+         qlayer, (example_hidden, attn_mask), {"position_embeddings": example_pos}
+     )
+     # Note that the model is not fully quantized.
+     cm.save(save_path)
+
+ print(f"Quantized Circle model saved to {save_path.resolve()}")

tico/quantization/wrapq/examples/quantize_llama_mlp.py
@@ -0,0 +1,95 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import pathlib
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ import tico
+ from tico.quantization import convert, prepare
+ from tico.quantization.config.ptq import PTQConfig
+ from tico.quantization.evaluation.metric import compute_peir
+ from tico.quantization.evaluation.utils import plot_two_outputs
+ from tico.quantization.wrapq.dtypes import INT16
+ from tico.quantization.wrapq.mode import Mode
+ from tico.quantization.wrapq.qscheme import QScheme
+ from tico.quantization.wrapq.wrappers.llama.quant_mlp import QuantLlamaMLP
+ from tico.utils.utils import SuppressWarning
+
+ name = "Maykeye/TinyLLama-v0"
+ model = AutoModelForCausalLM.from_pretrained(name)
+ tokenizer = AutoTokenizer.from_pretrained(name)
+ model.eval()
+
+ # -------------------------------------------------------------------------
+ # 1. Replace layer-0’s MLP with QuantLlamaMLP
+ # -------------------------------------------------------------------------
+ fp32_mlp = model.model.layers[0].mlp
+ model.model.layers[0].mlp = prepare(
+     fp32_mlp, PTQConfig(default_dtype=INT16, default_qscheme=QScheme.PER_TENSOR_SYMM)
+ )
+ model.eval()
+
+ mlp_q = model.model.layers[0].mlp
+ assert isinstance(mlp_q.wrapped, QuantLlamaMLP)
+
+ # -------------------------------------------------------------------------
+ # 2. Single-pass calibration
+ # -------------------------------------------------------------------------
+ PROMPTS = [
+     "The quick brown fox jumps over the lazy dog.",
+     "In 2025, AI systems accelerated hardware-software co-design at scale.",
+     "양자화는 왜 어려울까? 분포, 길이, 마스크가 관건이다.",
+     "今日はいい天気ですね。ところでRoPE角度は長さに依存します。",
+     "def quicksort(arr):\n if len(arr) <= 1: return arr\n ...",
+     "Prices rose 3.14% — see Figure 2; emails: foo@bar.com!",
+ ]
+
+ with torch.no_grad():
+     for prompt in PROMPTS:
+         enc = tokenizer(prompt, return_tensors="pt")
+         emb = model.model.embed_tokens(enc["input_ids"])
+         _ = mlp_q(emb)
+
+ convert(mlp_q)
+
+ assert mlp_q._mode is Mode.QUANT, "Quantization mode should be active now."
+
+ # -------------------------------------------------------------------------
+ # 3. Quick diff check (INT-sim vs FP32)
+ # -------------------------------------------------------------------------
+ with torch.no_grad():
+     ids = tokenizer("quant all tensors!", return_tensors="pt")
+     emb = model.model.embed_tokens(ids["input_ids"])
+     int16 = mlp_q(emb)  # INT-sim
+     fp32 = fp32_mlp(emb)  # baseline reference
+
+ print("┌───────────── Quantization Error Summary ─────────────")
+ print(f"│ Mean |diff|: {(int16 - fp32).abs().mean().item():.6f}")
+ print(f"│ PEIR       : {compute_peir(fp32, int16) * 100:.6f} %")
+ print("└──────────────────────────────────────────────────────")
+ print(plot_two_outputs(fp32, int16))
+
+ # -------------------------------------------------------------------------
+ # 4. Export the quantized block
+ # -------------------------------------------------------------------------
+ save_path = pathlib.Path("mlp.q.circle")
+ example_in = (torch.randn(1, 1, model.config.hidden_size),)
+
+ with SuppressWarning(UserWarning, ".*"):
+     cm = tico.convert(mlp_q, example_in)
+     cm.save(save_path)
+
+ print(f"Quantized Circle model saved to {save_path.resolve()}")