tico 0.1.0.dev251020__py3-none-any.whl → 0.1.0.dev251022__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tico/__init__.py +1 -1
- tico/experimental/quantization/config/ptq.py +4 -1
- tico/experimental/quantization/ptq/examples/compare_ppl.py +5 -16
- tico/experimental/quantization/ptq/examples/debug_quant_outputs.py +5 -20
- tico/experimental/quantization/ptq/examples/quantize_linear.py +7 -6
- tico/experimental/quantization/ptq/examples/quantize_llama_attn.py +6 -6
- tico/experimental/quantization/ptq/examples/quantize_llama_decoder_layer.py +6 -5
- tico/experimental/quantization/ptq/examples/quantize_llama_mlp.py +6 -6
- tico/experimental/quantization/ptq/examples/quantize_with_gptq.py +5 -24
- tico/experimental/quantization/ptq/quantizer.py +181 -0
- tico/experimental/quantization/public_interface.py +2 -2
- tico/experimental/quantization/quantizer_registry.py +11 -8
- {tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/METADATA +1 -1
- {tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/RECORD +18 -17
- {tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/LICENSE +0 -0
- {tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/WHEEL +0 -0
- {tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/entry_points.txt +0 -0
- {tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/top_level.txt +0 -0
tico/__init__.py
CHANGED

tico/experimental/quantization/config/ptq.py
CHANGED

@@ -75,6 +75,8 @@ class PTQConfig(BaseConfig):
     default_observer: Type[ObserverBase] = MinMaxObserver
     default_qscheme: QScheme = QScheme.PER_TENSOR_ASYMM
     overrides: Mapping[str, Mapping[str, Any]] = field(default_factory=dict)
+    # If True, any module that cannot be wrapped will raise.
+    strict_wrap: bool = True
 
     @property
     def name(self) -> str:
@@ -110,7 +112,8 @@ class PTQConfig(BaseConfig):
             self.default_observer,
             default_qscheme=self.default_qscheme,
             overrides=sub_overrides,
+            strict_wrap=self.strict_wrap,
         )
 
     def __repr__(self):
-        return f"PTQConfig(default_dtype={self.default_dtype}, default_observer={self.default_observer}, default_qscheme={self.default_qscheme}, overrides={dict(self.overrides)})"
+        return f"PTQConfig(default_dtype={self.default_dtype}, default_observer={self.default_observer}, default_qscheme={self.default_qscheme}, overrides={dict(self.overrides)}, strict_wrap={self.strict_wrap})"
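For orientation, here is a small sketch of how the new strict_wrap flag travels through a config tree: child() now forwards it, so per-layer configs inherit the parent's strictness. The field names come straight from the hunk above; the keyword-style constructor call is assumed from the dataclass-style field declarations.

```python
# Sketch only: field names are taken from the diff above; the keyword
# constructor call is assumed from the dataclass-style declarations.
from tico.experimental.quantization.config.ptq import PTQConfig

qcfg = PTQConfig(strict_wrap=False)   # permissive: unwrappable modules are skipped
layer_cfg = qcfg.child("layer0")      # per-layer scope derived from the parent

# child() now forwards strict_wrap, so the derived config keeps the policy
assert layer_cfg.strict_wrap == qcfg.strict_wrap
print(qcfg)  # __repr__ now includes strict_wrap=...
```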
tico/experimental/quantization/ptq/examples/compare_ppl.py
CHANGED

@@ -22,16 +22,15 @@
 
 import argparse
 import sys
-from typing import Optional
 
 import torch
 import tqdm
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from tico.experimental.quantization import convert, prepare
 from tico.experimental.quantization.config.ptq import PTQConfig
 from tico.experimental.quantization.ptq.utils.metrics import perplexity
-from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
 
 # Token-budget presets for activation calibration
 TOKENS: dict[str, int] = {
@@ -166,12 +165,7 @@ def main():
     # 2. Wrap every Transformer layer with PTQWrapper
     # ---------------------------------------------------------------------
     qcfg = PTQConfig()  # all-uint8 defaults
-
-    wrapped_layers = torch.nn.ModuleList()
-    for idx, layer in enumerate(uint8_model.model.layers):
-        layer_cfg = qcfg.child(f"layer{idx}")
-        wrapped_layers.append(PTQWrapper(layer, qcfg=layer_cfg))
-    uint8_model.model.layers = wrapped_layers
+    prepare(uint8_model, qcfg)
 
     # ---------------------------------------------------------------------
     # 3. Single-pass activation calibration
@@ -182,11 +176,7 @@
     )[:CALIB_TOKENS]
     ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(device)
 
-    #
-    for l in uint8_model.model.layers:
-        l.enable_calibration()
-
-    # (b) run inference to collect ranges
+    # Run inference to collect ranges
     iterator = range(0, ids.size(1) - 1, args.stride)
     if not args.no_tqdm:
         iterator = tqdm.tqdm(iterator, desc="Calibration")
@@ -194,9 +184,8 @@
     for i in iterator:
         uint8_model(ids[:, i : i + args.stride])
 
-    #
-    for l in uint8_model.model.layers:
-        l.freeze_qparams()
+    # Freeze (scale, zero-point)
+    convert(uint8_model)
 
     # -------------------------------------------------------------------------
     # 4. Evaluate perplexity
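The net effect on this example is that it no longer touches PTQWrapper or per-layer enable_calibration/freeze_qparams loops; the flow collapses to prepare, forward passes, convert. A condensed sketch of that flow, with model and tokenizer setup elided and the stride value chosen arbitrarily:

```python
# Condensed from the example above; model/tokenizer loading is elided and
# the calibration stride is an arbitrary choice for illustration.
import torch

from tico.experimental.quantization import convert, prepare
from tico.experimental.quantization.config.ptq import PTQConfig


def calibrate_uint8(model, calib_ids: torch.Tensor, stride: int = 512):
    prepare(model, PTQConfig())                       # wrap layers, enter calibration mode
    with torch.no_grad():
        for i in range(0, calib_ids.size(1) - 1, stride):
            model(calib_ids[:, i : i + stride])       # observers collect activation ranges
    convert(model)                                    # freeze (scale, zero-point) -> QUANT mode
    return model
```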
tico/experimental/quantization/ptq/examples/debug_quant_outputs.py
CHANGED

@@ -38,6 +38,7 @@ import tqdm
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from tico.experimental.quantization import convert, prepare
 from tico.experimental.quantization.config.ptq import PTQConfig
 from tico.experimental.quantization.ptq.utils.introspection import (
     build_fqn_map,
@@ -177,18 +178,7 @@ def main():
     # -------------------------------------------------------------------------
     print("Wrapping layers with PTQWrapper …")
     qcfg = PTQConfig()  # default: per-tensor UINT8
-
-    new_layers = torch.nn.ModuleList()
-    for idx, fp_layer in enumerate(model.model.layers):
-        layer_cfg = qcfg.child(f"layer{idx}")
-        q_layer = PTQWrapper(
-            fp_layer,
-            qcfg=layer_cfg,
-            fp_name=m_to_fqn.get(fp_layer),
-        )
-        new_layers.append(q_layer)
-
-    model.model.layers = new_layers  # swap in quant wrappers
+    prepare(model, qcfg)
 
     # -------------------------------------------------------------------------
     # 3. Activation calibration plus FP-vs-UINT8 diffing
@@ -197,10 +187,6 @@ def main():
     calib_txt = " ".join(dataset["text"])[:CALIB_TOKENS]
     ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(device)
 
-    # (a) Enable CALIB mode on every QuantModuleBase
-    for l in model.model.layers:
-        l.enable_calibration()
-
     # Save reference FP activations before observers clamp/quantize
     save_handles, act_cache = save_fp_outputs(model)
 
@@ -216,11 +202,10 @@
     for h in save_handles:
         h.remove()
 
-    #
-    for l in model.model.layers:
-        l.freeze_qparams()
+    # Freeze (scale, zero-point) after calibration
+    convert(model)
 
-    #
+    # Register diff hooks and measure per-layer deltas
     cmp_handles = compare_layer_outputs(model, act_cache, metrics=["diff", "peir"])
     # Use same inputs for comparison.
     with torch.no_grad():
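The ordering in this script matters: FP reference activations are cached while the model is still in calibration mode, qparams are frozen by convert, and only then are the diff hooks attached so quantized outputs are compared against the cached FP ones. A rough sketch of that sequence; the hook helpers are the ones used above, but their import location and exact behavior beyond what the diff shows are assumptions:

```python
# Sequence sketch; save_fp_outputs/compare_layer_outputs are called exactly as in
# the diff, but their import path (the truncated introspection import block) and
# return details are assumptions.
import torch

from tico.experimental.quantization import convert, prepare
from tico.experimental.quantization.config.ptq import PTQConfig
from tico.experimental.quantization.ptq.utils.introspection import (  # assumed path
    compare_layer_outputs,
    save_fp_outputs,
)


def debug_quant_error(model, calib_ids: torch.Tensor):
    prepare(model, PTQConfig())                        # wrap + calibration mode
    save_handles, act_cache = save_fp_outputs(model)   # cache FP activations
    with torch.no_grad():
        model(calib_ids)                               # calibration pass + FP capture
    for h in save_handles:
        h.remove()

    convert(model)                                     # freeze scale / zero-point

    cmp_handles = compare_layer_outputs(model, act_cache, metrics=["diff", "peir"])
    with torch.no_grad():
        model(calib_ids)                               # same inputs, now quantized
    for h in cmp_handles:
        h.remove()
```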
tico/experimental/quantization/ptq/examples/quantize_linear.py
CHANGED

@@ -29,13 +29,15 @@ import pathlib
 import torch
 import torch.nn as nn
 
+from tico.experimental.quantization import convert, prepare
+from tico.experimental.quantization.config.ptq import PTQConfig
 from tico.experimental.quantization.evaluation.metric import compute_peir
 from tico.experimental.quantization.evaluation.utils import plot_two_outputs
-
 from tico.experimental.quantization.ptq.mode import Mode
 from tico.experimental.quantization.ptq.wrappers.nn.quant_linear import QuantLinear
 from tico.utils.utils import SuppressWarning
 
+
 # -------------------------------------------------------------------------
 # 0. Define a toy model (1 Linear layer only)
 # -------------------------------------------------------------------------
@@ -60,20 +62,19 @@ fp32_layer = model.fc
 # -------------------------------------------------------------------------
 # 1. Replace the Linear with QuantLinear wrapper
 # -------------------------------------------------------------------------
-model.fc =
-# model.fc = PTQWrapper(fp32_layer) (Wrapping helper class)
+model.fc = prepare(fp32_layer, PTQConfig())  # type: ignore[assignment]
 qlayer = model.fc  # alias for brevity
 
 # -------------------------------------------------------------------------
 # 2. Single-pass calibration (collect activation ranges)
 # -------------------------------------------------------------------------
-assert isinstance(qlayer, QuantLinear)
+assert isinstance(qlayer.wrapped, QuantLinear)
 with torch.no_grad():
-    qlayer.enable_calibration()
     for _ in range(16):  # small toy batch
        x = torch.randn(4, 16)  # (batch=4, features=16)
        _ = model(x)
-
+
+convert(qlayer)
 
 assert qlayer._mode is Mode.QUANT, "Quantization mode should be active now."
 
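Leaf-level use is the notable API point here: prepare() can take a single module, returns a wrapper around it, and the underlying QuantLinear is reached through .wrapped. A toy sketch mirroring the example, with layer sizes and batch count chosen arbitrarily:

```python
# Toy sketch mirroring the example above; layer sizes and batch count are arbitrary.
import torch
import torch.nn as nn

from tico.experimental.quantization import convert, prepare
from tico.experimental.quantization.config.ptq import PTQConfig
from tico.experimental.quantization.ptq.mode import Mode
from tico.experimental.quantization.ptq.wrappers.nn.quant_linear import QuantLinear

fc = nn.Linear(16, 8)
qfc = prepare(fc, PTQConfig())          # returns a quant wrapper, not the bare Linear
assert isinstance(qfc.wrapped, QuantLinear)

with torch.no_grad():
    for _ in range(16):                 # tiny random calibration set
        _ = qfc(torch.randn(4, 16))     # observers collect activation ranges

convert(qfc)                            # calibration done -> quantized inference
assert qfc._mode is Mode.QUANT
```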
tico/experimental/quantization/ptq/examples/quantize_llama_attn.py
CHANGED

@@ -17,9 +17,10 @@ import pathlib
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from tico.experimental.quantization import convert, prepare
+from tico.experimental.quantization.config.ptq import PTQConfig
 from tico.experimental.quantization.evaluation.metric import compute_peir
 from tico.experimental.quantization.evaluation.utils import plot_two_outputs
-
 from tico.experimental.quantization.ptq.mode import Mode
 from tico.experimental.quantization.ptq.wrappers.llama.quant_attn import (
     QuantLlamaAttention,
@@ -34,12 +35,11 @@ tokenizer = AutoTokenizer.from_pretrained(name)
 # 1. Replace layer-0’s MLP with QuantLlamaMLP
 # -------------------------------------------------------------------------
 orig_attn = model.model.layers[0].self_attn
-model.model.layers[0].self_attn =
-    orig_attn
-)  # PTQWrapper(orig_attn) is also fine
+model.model.layers[0].self_attn = prepare(orig_attn, PTQConfig())
 model.eval()
 
 attn_q = model.model.layers[0].self_attn  # quant wrapper
+assert isinstance(attn_q.wrapped, QuantLlamaAttention)
 rotary = model.model.rotary_emb
 
 # -------------------------------------------------------------------------
@@ -55,7 +55,6 @@ PROMPTS = [
 ]
 
 with torch.no_grad():
-    attn_q.enable_calibration()
     for prompt in PROMPTS:
         ids = tokenizer(prompt, return_tensors="pt")
         embeds = model.model.embed_tokens(ids["input_ids"])
@@ -63,7 +62,8 @@ with torch.no_grad():
         S = cos_sin[0].shape[1]
         float_mask = torch.zeros(1, 1, S, S)
         _ = attn_q(embeds, cos_sin)  # observers collect
-
+
+convert(attn_q)
 
 assert attn_q._mode is Mode.QUANT, "Quantization mode should be active now."
 
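For a single attention block the extra wrinkle is that calibration inputs must be real hidden states plus rotary position embeddings, which is exactly what the example feeds it. A sketch of that call shape; the checkpoint name is a placeholder and the rotary-embedding call is an assumption based on the surrounding example code:

```python
# Sketch of the calibration call shape for one wrapped attention block.
# The checkpoint name is a placeholder and the rotary_emb call is an assumption;
# prepare/convert and the (embeds, cos_sin) call come from the diff above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from tico.experimental.quantization import convert, prepare
from tico.experimental.quantization.config.ptq import PTQConfig

name = "Maykeye/TinyLLama-v0"            # placeholder Llama-style checkpoint
model = AutoModelForCausalLM.from_pretrained(name).eval()
tokenizer = AutoTokenizer.from_pretrained(name)

model.model.layers[0].self_attn = prepare(model.model.layers[0].self_attn, PTQConfig())
attn_q = model.model.layers[0].self_attn

with torch.no_grad():
    ids = tokenizer("Hello there!", return_tensors="pt")["input_ids"]
    embeds = model.model.embed_tokens(ids)
    position_ids = torch.arange(ids.size(1))[None]
    cos_sin = model.model.rotary_emb(embeds, position_ids)   # (cos, sin) tuple, assumed call
    _ = attn_q(embeds, cos_sin)                              # observers record ranges

convert(attn_q)
```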
tico/experimental/quantization/ptq/examples/quantize_llama_decoder_layer.py
CHANGED

@@ -31,6 +31,8 @@ import pathlib
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from tico.experimental.quantization import convert, prepare
+from tico.experimental.quantization.config.ptq import PTQConfig
 from tico.experimental.quantization.evaluation.metric import compute_peir
 from tico.experimental.quantization.evaluation.utils import plot_two_outputs
 from tico.experimental.quantization.ptq.mode import Mode
@@ -50,12 +52,11 @@ rotary = model.model.rotary_emb  # RoPE helper
 # 1. Swap in the quant wrapper
 # -------------------------------------------------------------------------
 fp32_layer = model.model.layers[0]  # keep a reference for diff check
-model.model.layers[0] =
-    fp32_layer
-)  # PTQWrapper(fp32_layer) is also fine
+model.model.layers[0] = prepare(fp32_layer, PTQConfig())
 model.eval()
 
 qlayer = model.model.layers[0]  # alias for brevity
+assert isinstance(qlayer.wrapped, QuantLlamaDecoderLayer)
 
 # -------------------------------------------------------------------------
 # 2. Single-pass calibration (gather activation ranges)
@@ -70,7 +71,6 @@ PROMPTS = [
 ]
 
 with torch.no_grad():
-    qlayer.enable_calibration()
     for prompt in PROMPTS:
         ids = tokenizer(prompt, return_tensors="pt")
         hidden = model.model.embed_tokens(ids["input_ids"])
@@ -78,7 +78,8 @@ with torch.no_grad():
         S = pos[0].shape[1]
         attn_mask = torch.zeros(1, 1, S, S)  # causal-mask placeholder
         _ = qlayer(hidden, attention_mask=attn_mask, position_embeddings=pos)
-
+
+convert(qlayer)
 
 assert qlayer._mode is Mode.QUANT, "Quantization mode should be active now."
 
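Since the script keeps fp32_layer around "for diff check", the natural follow-up is a numerical comparison of the original layer and its quantized wrapper on identical inputs. A sketch of such a check using the helpers the example imports; their exact signatures, and the assumption that the wrapper mirrors the HF layer's tuple return, are not confirmed by this diff:

```python
# Sketch of the "diff check" the example's comment refers to: run the kept FP layer
# and the quantized wrapper on identical inputs and compare. compute_peir's exact
# signature and the tuple-return indexing are assumptions.
import torch

from tico.experimental.quantization.evaluation.metric import compute_peir
from tico.experimental.quantization.evaluation.utils import plot_two_outputs


def diff_check(fp32_layer, qlayer, hidden, pos):
    """Compare FP32 vs quantized decoder-layer outputs on the same inputs."""
    S = hidden.size(1)
    mask = torch.zeros(1, 1, S, S)                     # causal-mask placeholder
    with torch.no_grad():
        fp_out = fp32_layer(hidden, attention_mask=mask, position_embeddings=pos)[0]
        q_out = qlayer(hidden, attention_mask=mask, position_embeddings=pos)[0]
    print("PEIR:", compute_peir(fp_out, q_out))        # peak error-to-interval ratio (assumed)
    print(plot_two_outputs(fp_out, q_out))             # side-by-side visualization helper
    return fp_out, q_out
```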
tico/experimental/quantization/ptq/examples/quantize_llama_mlp.py
CHANGED

@@ -18,6 +18,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import tico
+from tico.experimental.quantization import convert, prepare
 from tico.experimental.quantization.config.ptq import PTQConfig
 from tico.experimental.quantization.evaluation.metric import compute_peir
 from tico.experimental.quantization.evaluation.utils import plot_two_outputs
@@ -36,13 +37,13 @@ model.eval()
 # 1. Replace layer-0’s MLP with QuantLlamaMLP
 # -------------------------------------------------------------------------
 fp32_mlp = model.model.layers[0].mlp
-model.model.layers[0].mlp =
-    fp32_mlp,
-
-)  # PTQWrapper(fp32_mlp) is also fine
+model.model.layers[0].mlp = prepare(
+    fp32_mlp, PTQConfig(default_dtype=INT16, default_qscheme=QScheme.PER_TENSOR_SYMM)
+)
 model.eval()
 
 mlp_q = model.model.layers[0].mlp
+assert isinstance(mlp_q.wrapped, QuantLlamaMLP)
 
 # -------------------------------------------------------------------------
 # 2. Single-pass calibration
@@ -57,13 +58,12 @@ PROMPTS = [
 ]
 
 with torch.no_grad():
-    mlp_q.enable_calibration()
     for prompt in PROMPTS:
         enc = tokenizer(prompt, return_tensors="pt")
         emb = model.model.embed_tokens(enc["input_ids"])
         _ = mlp_q(emb)
 
-
+convert(mlp_q)
 
 assert mlp_q._mode is Mode.QUANT, "Quantization mode should be active now."
 
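This example is the one that departs from the all-uint8 default: the MLP is prepared with INT16 per-tensor symmetric quantization. A minimal sketch of building that config; the import paths for INT16 and QScheme are assumptions (ptq/dtypes.py and ptq/qscheme.py appear in RECORD, but their contents are not shown in this diff):

```python
# Sketch: INT16 / per-tensor symmetric config as used by the MLP example.
# The import locations of INT16 and QScheme are assumed, not shown in this diff.
from tico.experimental.quantization import convert, prepare
from tico.experimental.quantization.config.ptq import PTQConfig
from tico.experimental.quantization.ptq.dtypes import INT16      # assumed path
from tico.experimental.quantization.ptq.qscheme import QScheme   # assumed path

int16_cfg = PTQConfig(
    default_dtype=INT16,
    default_qscheme=QScheme.PER_TENSOR_SYMM,
)
# mlp_q = prepare(model.model.layers[0].mlp, int16_cfg)
# ... run calibration prompts through mlp_q ...
# convert(mlp_q)
```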
tico/experimental/quantization/ptq/examples/quantize_with_gptq.py
CHANGED

@@ -215,22 +215,8 @@ def main():
     # 4. Wrap every layer with PTQWrapper (activation UINT-8)
     # -------------------------------------------------------------------------
     print("Wrapping layers with PTQWrapper …")
-    layers = q_m.model.layers
-    if not isinstance(layers, (list, torch.nn.ModuleList)):
-        raise TypeError(f"'model.layers' must be list/ModuleList, got {type(layers)}")
-
     qcfg = PTQConfig()  # default: per-tensor UINT8
-
-    for idx, fp_layer in enumerate(layers):
-        layer_cfg = qcfg.child(f"layer{idx}")
-        wrapped.append(
-            PTQWrapper(
-                fp_layer,
-                qcfg=layer_cfg,
-                fp_name=m_to_fqn.get(fp_layer),
-            )
-        )
-    q_m.model.layers = wrapped
+    prepare(q_m, qcfg)
 
     # -------------------------------------------------------------------------
     # 5. Single-pass activation calibration
@@ -242,11 +228,7 @@ def main():
     calib_txt = " ".join(dataset_train["text"])[:CALIB_TOKENS]
     train_ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(device)
 
-    #
-    for l in q_m.model.layers:
-        l.enable_calibration()
-
-    # (b) Overwrite weight observers with GPTQ statistics
+    # Overwrite weight observers with GPTQ statistics
     if hasattr(q_m, "quantizers") and isinstance(q_m.quantizers, dict):
         inject_gptq_qparams(q_m, q_m.quantizers)
     else:
@@ -254,7 +236,7 @@ def main():
             "[Warn] q_m.quantizers not found or not a dict; skipping GPTQ qparam injection."
         )
 
-    #
+    # Forward passes to collect activation ranges
     iterator = range(0, train_ids.size(1) - 1, args.stride)
     if not args.no_tqdm:
         iterator = tqdm.tqdm(iterator, desc="Act-calibration")
@@ -262,9 +244,8 @@ def main():
     for i in iterator:
         q_m(train_ids[:, i : i + args.stride])
 
-    #
-    for l in q_m.model.layers:
-        l.freeze_qparams()
+    # Freeze all Q-params (scale, zero-point)
+    convert(q_m)
 
     # -------------------------------------------------------------------------
     # 6. Evaluate perplexity on Wikitext-2
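The GPTQ example keeps the same prepare/convert frame but inserts one extra step: after prepare(), weight observers are overwritten with the GPTQ statistics already attached to the model, and only then are activations calibrated and frozen. A compressed sketch of that ordering; how q_m and its quantizers dict are produced earlier in the script is outside this diff and left abstract here:

```python
# Ordering sketch for the GPTQ + PTQ flow. Producing `q_m` (a model that already
# carries GPTQ statistics in `q_m.quantizers`) and the inject_gptq_qparams helper
# are outside this diff, so they are passed in rather than imported.
import torch

from tico.experimental.quantization import convert, prepare
from tico.experimental.quantization.config.ptq import PTQConfig


def ptq_on_top_of_gptq(q_m, train_ids: torch.Tensor, inject_gptq_qparams, stride: int = 512):
    prepare(q_m, PTQConfig())                        # 1) wrap layers, enter calibration mode

    if hasattr(q_m, "quantizers") and isinstance(q_m.quantizers, dict):
        inject_gptq_qparams(q_m, q_m.quantizers)     # 2) GPTQ stats -> weight observers

    with torch.no_grad():                            # 3) activation calibration
        for i in range(0, train_ids.size(1) - 1, stride):
            q_m(train_ids[:, i : i + stride])

    convert(q_m)                                     # 4) freeze all (scale, zero-point)
    return q_m
```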
tico/experimental/quantization/ptq/quantizer.py
ADDED

@@ -0,0 +1,181 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn as nn
+
+from tico.experimental.quantization.config.ptq import PTQConfig
+
+from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
+from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
+    QuantModuleBase,
+)
+from tico.experimental.quantization.quantizer import BaseQuantizer
+from tico.experimental.quantization.quantizer_registry import register_quantizer
+
+
+@register_quantizer(PTQConfig)
+class PTQQuantizer(BaseQuantizer):
+    """
+    Post-Training Quantization (PTQ) quantizer integrated with the public interface.
+
+    Features
+    --------
+    • Automatically wraps quantizable modules using PTQWrapper.
+    • Supports leaf-level (single-module) quantization (e.g., prepare(model.fc, PTQConfig())).
+    • Enforces strict wrapping if `strict_wrap=True`: raises NotImplementedError if
+      no quantizable module was found at any boundary.
+    • If `strict_wrap=False`, unquantizable modules are silently skipped.
+    """
+
+    def __init__(self, config: PTQConfig):
+        super().__init__(config)
+        self.qcfg: PTQConfig = config
+        self.strict_wrap: bool = bool(getattr(config, "strict_wrap", True))
+
+    @torch.no_grad()
+    def prepare(
+        self,
+        model: torch.nn.Module,
+        args: Optional[Any] = None,
+        kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        # Wrap the tree (or single module) according to strictness policy
+        model = self._wrap_supported(model, self.qcfg)
+
+        # Switch all quant modules into calibration mode
+        if isinstance(model, QuantModuleBase):
+            model.enable_calibration()
+        for m in model.modules():
+            if isinstance(m, QuantModuleBase):
+                m.enable_calibration()
+        return model
+
+    @torch.no_grad()
+    def convert(self, model):
+        # Freeze qparams across the tree (QUANT mode)
+        if isinstance(model, QuantModuleBase):
+            model.freeze_qparams()
+        for m in model.modules():
+            if isinstance(m, QuantModuleBase):
+                m.freeze_qparams()
+        return model
+
+    def _wrap_supported(
+        self,
+        root: nn.Module,
+        qcfg: PTQConfig,
+    ) -> nn.Module:
+        """
+        Recursively attempt to wrap boundaries. Strictness is applied at every boundary.
+        """
+        assert not isinstance(root, QuantModuleBase), "The module is already wrapped."
+
+        # Case A: HuggingFace-style transformers: model.model.layers
+        lm = getattr(root, "model", None)
+        layers = getattr(lm, "layers", None) if isinstance(lm, nn.Module) else None
+        if isinstance(layers, nn.ModuleList):
+            new_list = nn.ModuleList()
+            for idx, layer in enumerate(layers):
+                child_scope = f"layer{idx}"
+                child_cfg = qcfg.child(child_scope)
+
+                # Enforce strictness at the child boundary
+                wrapped = self._try_wrap(
+                    layer,
+                    child_cfg,
+                    fp_name=child_scope,
+                    raise_on_fail=self.strict_wrap,
+                )
+                new_list.append(wrapped)
+            lm.layers = new_list  # type: ignore[union-attr]
+            return root
+
+        # Case B: Containers
+        if isinstance(root, (nn.Sequential, nn.ModuleList)):
+            for i, child in enumerate(list(root)):
+                name = str(i)
+                child_cfg = qcfg.child(name)
+
+                wrapped = self._try_wrap(
+                    child, child_cfg, fp_name=name, raise_on_fail=self.strict_wrap
+                )
+                if wrapped is child:
+                    assert not self.strict_wrap
+                    wrapped = self._wrap_supported(wrapped, child_cfg)
+                root[i] = wrapped  # type: ignore[index]
+
+        if isinstance(root, nn.ModuleDict):
+            for k, child in list(root.items()):
+                name = k
+                child_cfg = qcfg.child(name)
+
+                wrapped = self._try_wrap(
+                    child, child_cfg, fp_name=name, raise_on_fail=self.strict_wrap
+                )
+                if wrapped is child:
+                    assert not self.strict_wrap
+                    wrapped = self._wrap_supported(wrapped, child_cfg)
+                root[k] = wrapped  # type: ignore[index]
+
+        # Case C: Leaf node
+        root_name = getattr(root, "_get_name", lambda: None)()
+        wrapped = self._try_wrap(
+            root, qcfg, fp_name=root_name, raise_on_fail=self.strict_wrap
+        )
+        if wrapped is not root:
+            return wrapped
+
+        assert not self.strict_wrap
+        # Case D: Named children
+        for name, child in list(root.named_children()):
+            child_cfg = qcfg.child(name)
+
+            wrapped = self._try_wrap(
+                child, child_cfg, fp_name=name, raise_on_fail=self.strict_wrap
+            )
+            if wrapped is child:
+                assert not self.strict_wrap
+                wrapped = self._wrap_supported(wrapped, child_cfg)
+            setattr(root, name, wrapped)
+
+        return root
+
+    def _try_wrap(
+        self,
+        module: nn.Module,
+        qcfg_for_child: PTQConfig,
+        *,
+        fp_name: Optional[str],
+        raise_on_fail: bool,
+    ) -> nn.Module:
+        """
+        Attempt to wrap a boundary with PTQWrapper.
+
+        Behavior:
+          • If PTQWrapper succeeds: return wrapped module.
+          • If PTQWrapper raises NotImplementedError:
+              - raise_on_fail=True  -> re-raise (strict)
+              - raise_on_fail=False -> return original module (permissive)
+        """
+        try:
+            return PTQWrapper(module, qcfg=qcfg_for_child, fp_name=fp_name)
+        except NotImplementedError as e:
+            if raise_on_fail:
+                raise NotImplementedError(
+                    f"PTQQuantizer: no quantization wrapper for {type(module).__name__}"
+                ) from e
+            return module
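The practical consequence of this new module is that PTQConfig is dispatched like any other algorithm config: register_quantizer(PTQConfig) binds PTQQuantizer to the config class, and the public prepare()/convert() entry points can resolve it through get_quantizer. A small sketch of driving the quantizer class directly on a single module; the BaseQuantizer and registry plumbing in between is not shown in this diff:

```python
# Direct use of the new PTQQuantizer on a single module; equivalent in spirit to the
# public prepare()/convert() calls, which reach this class via @register_quantizer.
import torch
import torch.nn as nn

from tico.experimental.quantization.config.ptq import PTQConfig
from tico.experimental.quantization.ptq.quantizer import PTQQuantizer

quantizer = PTQQuantizer(PTQConfig())       # strict_wrap=True by default
qfc = quantizer.prepare(nn.Linear(16, 8))   # leaf boundary -> wrapped + calibration mode

with torch.no_grad():
    for _ in range(8):
        _ = qfc(torch.randn(4, 16))         # observers collect activation ranges

qfc = quantizer.convert(qfc)                # freeze qparams -> quantized mode
```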
tico/experimental/quantization/public_interface.py
CHANGED

@@ -32,7 +32,7 @@ def prepare(
     quant_config: BaseConfig,
     args: Optional[Any] = None,
     kwargs: Optional[Dict[str, Any]] = None,
-    inplace: Optional[bool] =
+    inplace: Optional[bool] = True,
 ):
     """
     Prepare the model for quantization using the provided configuration.
@@ -68,7 +68,7 @@ def prepare(
     return model
 
 
-def convert(model, inplace: Optional[bool] =
+def convert(model, inplace: Optional[bool] = True):
     """
     Convert the prepared model to a quantized model using the provided configuration.
 
tico/experimental/quantization/quantizer_registry.py
CHANGED

@@ -53,14 +53,17 @@ def get_quantizer(cfg: BaseConfig) -> BaseQuantizer:
     # Lazy import by naming convention
     name = getattr(cfg, "name", None)
     if name:
-        importlib.import_module(
+        if name == "ptq":
+            importlib.import_module(f"tico.experimental.quantization.ptq.quantizer")
+        else:
+            try:
+                importlib.import_module(
+                    f"tico.experimental.quantization.algorithm.{name}.quantizer"
+                )
+            except Exception as e:
+                raise RuntimeError(
+                    f"Failed to import quantizer module for config name='{name}': {e}"
+                )
 
     qcls = _lookup(cfg)
     if qcls is not None:
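get_quantizer now special-cases the "ptq" config name so the new quantizer module, which lives outside the algorithm package, can still be imported lazily, while every other name keeps mapping to tico.experimental.quantization.algorithm.<name>.quantizer. A small sketch of just that name-to-module rule; _lookup and the rest of the registry are not part of this diff:

```python
# Name-to-module mapping sketch for the lazy import in get_quantizer.
# Only the mapping rule is illustrated; _lookup and the registry internals
# are not shown in this diff.
import importlib


def quantizer_module_for(name: str) -> str:
    if name == "ptq":
        return "tico.experimental.quantization.ptq.quantizer"
    return f"tico.experimental.quantization.algorithm.{name}.quantizer"


# Importing the module triggers its @register_quantizer decorator as a side effect.
mod = importlib.import_module(quantizer_module_for("ptq"))
print(mod.__name__)
```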
{tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-tico/__init__.py,sha256=
+tico/__init__.py,sha256=EsX-4xNY4R5ooL4KCszR-_fjyRlWHKMUQ3Bk8MvMf_Q,1883
 tico/pt2_to_circle.py,sha256=gu3MD4Iqc0zMZcCZ2IT8oGbyj21CTSbT3Rgd9s2B_9A,2767
 tico/config/__init__.py,sha256=xZzCXjZ84qE-CsBi-dfaL05bqpQ3stKKfTXhnrJRyVs,142
 tico/config/base.py,sha256=q5xMqGxTUZs4mFqt5c7i_y9U00fYgdMGl9nUqIVMlCo,1248
@@ -6,9 +6,9 @@ tico/config/factory.py,sha256=il0zqB6Lm5NX2LnG-TUhmiP9vVeZ_3TucJMorVZIodY,1324
 tico/config/v1.py,sha256=uB5d39fkmuBACwjBVGtdWb_HGXfXsvmw6nw64xZcC-8,1342
 tico/experimental/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/__init__.py,sha256=IaJPZegVJp0P3luutBo907Kp5sOJensE1Mm-XBG_jBs,122
-tico/experimental/quantization/public_interface.py,sha256=
+tico/experimental/quantization/public_interface.py,sha256=56lfDZIIC8pICyR0qAM1qGx6eAcJMbE-GARW8Bxkls0,4218
 tico/experimental/quantization/quantizer.py,sha256=pDTQGzR-BcQJeGZ7O4cXRQdCme4q_POpxHetwnv0bYg,2370
-tico/experimental/quantization/quantizer_registry.py,sha256=
+tico/experimental/quantization/quantizer_registry.py,sha256=Jhiw2XMlLTn51hHim1okx3ozr-mjulR-SHiQRjYvfXc,2502
 tico/experimental/quantization/algorithm/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/algorithm/gptq/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/algorithm/gptq/gptq.py,sha256=Qn9b_2ki7B64DcVEY25NMkww3PdZ5EqYQQXfYhNDQ6I,5555
@@ -44,7 +44,7 @@ tico/experimental/quantization/config/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3
 tico/experimental/quantization/config/base.py,sha256=xg_HCDSuMgYvMd6ENZe4Sm2SYJgMaCBj4cmqaz_lhAs,816
 tico/experimental/quantization/config/gptq.py,sha256=IUIEz5bLhsTXqoBCE1rfPec99zsRjwgpDbPW5YJqOPg,973
 tico/experimental/quantization/config/pt2e.py,sha256=9HCrraTGGZeKEN9puKV-ODi7ncV2Wjc3oe_JCO1D_Rs,850
-tico/experimental/quantization/config/ptq.py,sha256=
+tico/experimental/quantization/config/ptq.py,sha256=4QhoJ6hTJOb1MH88sa0vxKS6GdPdFIy1rFjEXsnarrk,4595
 tico/experimental/quantization/config/smoothquant.py,sha256=b92dz4-MiBbkaLzXb47bVoO29d2P416woFQUZ1wpO_s,1414
 tico/experimental/quantization/evaluation/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/evaluation/backend.py,sha256=CZL9rZOA0t8cH7PHp6u9l7dGqWNvTj9bKOvwo0PVul0,692
@@ -66,14 +66,15 @@ tico/experimental/quantization/ptq/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3oux
 tico/experimental/quantization/ptq/dtypes.py,sha256=xfCBtq6mQmUYRwsoFgII6gvRl1raQi0Inj9pznDuKwQ,2236
 tico/experimental/quantization/ptq/mode.py,sha256=lT-T8vIv8YWcwrjT7xXVhOw1g7aoAdh_3PWB-ptPKaI,1052
 tico/experimental/quantization/ptq/qscheme.py,sha256=uwhv7bCxOOXB3I-IKlRyr_u4eXOq48uIqGy4TLDqGxY,1301
+tico/experimental/quantization/ptq/quantizer.py,sha256=4hS6S4O8ytcUKoZdjTSmbUrCzWlCELsb_uz5ARnQucI,6647
 tico/experimental/quantization/ptq/examples/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
-tico/experimental/quantization/ptq/examples/compare_ppl.py,sha256=
-tico/experimental/quantization/ptq/examples/debug_quant_outputs.py,sha256=
-tico/experimental/quantization/ptq/examples/quantize_linear.py,sha256=
-tico/experimental/quantization/ptq/examples/quantize_llama_attn.py,sha256=
-tico/experimental/quantization/ptq/examples/quantize_llama_decoder_layer.py,sha256=
-tico/experimental/quantization/ptq/examples/quantize_llama_mlp.py,sha256=
-tico/experimental/quantization/ptq/examples/quantize_with_gptq.py,sha256=
+tico/experimental/quantization/ptq/examples/compare_ppl.py,sha256=Ap4s62eCMYkVwxdlysVRxdWg6AhLRv7Ib0wqytHqEmY,7774
+tico/experimental/quantization/ptq/examples/debug_quant_outputs.py,sha256=2e1zPFsB0XNTlgjiFrSgR-ezULCXCXHg-5fuSyEgPfI,7713
+tico/experimental/quantization/ptq/examples/quantize_linear.py,sha256=Dox0oxj6HpTwrCQMEltch-7V4ZaKd72IfREizzwp-Ms,4548
+tico/experimental/quantization/ptq/examples/quantize_llama_attn.py,sha256=IsQ4hO_uTcjvVqYqmghzQj5FC98ZlYUpY3mOR3Y3H-A,4421
+tico/experimental/quantization/ptq/examples/quantize_llama_decoder_layer.py,sha256=DNkIoInx9tM1kHEQV-gKPiqRPAOZ3qUNroVq32D_5So,5843
+tico/experimental/quantization/ptq/examples/quantize_llama_mlp.py,sha256=XfYv68anCV5NPeByo4tEyQYjVmC4KArG15sVyjThuTc,4149
+tico/experimental/quantization/ptq/examples/quantize_with_gptq.py,sha256=39c7T-JvUnCAeVa8e1A3FKZWBF32Mnvkv9Dms9LLvdU,9769
 tico/experimental/quantization/ptq/observers/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/ptq/observers/affine_base.py,sha256=e2Eba64nrxKQyE4F_WJ7WTSsk3xe6bkdGUKaoLFWGFw,4638
 tico/experimental/quantization/ptq/observers/base.py,sha256=Wons1MzpqK1mfcy-ppl-B2Dum0edXg2dWW2Lw3V18tw,3280
@@ -262,9 +263,9 @@ tico/utils/mx/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/utils/mx/elemwise_ops.py,sha256=V6glyAHsVR1joqpsgnNytatCD_ew92xNWZ19UFDoMTA,10281
 tico/utils/mx/formats.py,sha256=uzNWyu-1onUlwQfX5cZ6fZSUfHMRqorper7_T1k3jfk,3404
 tico/utils/mx/mx_ops.py,sha256=RcfUTYVi-wilGB2sC35OeARdwDqnixv7dG5iyZ-fQT8,8555
-tico-0.1.0.dev251020.dist-info/LICENSE,sha256=
-tico-0.1.0.dev251020.dist-info/METADATA,sha256=
-tico-0.1.0.dev251020.dist-info/WHEEL,sha256=
-tico-0.1.0.dev251020.dist-info/entry_points.txt,sha256=
-tico-0.1.0.dev251020.dist-info/top_level.txt,sha256=
-tico-0.1.0.dev251020.dist-info/RECORD,,
+tico-0.1.0.dev251022.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
+tico-0.1.0.dev251022.dist-info/METADATA,sha256=3K8YDuxSO0M8dPuuo7-Fux6HX68W8DJ-Cz5v9R7sz1M,8455
+tico-0.1.0.dev251022.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+tico-0.1.0.dev251022.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
+tico-0.1.0.dev251022.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
+tico-0.1.0.dev251022.dist-info/RECORD,,

{tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/LICENSE (file without changes)
{tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/WHEEL (file without changes)
{tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/entry_points.txt (file without changes)
{tico-0.1.0.dev251020.dist-info → tico-0.1.0.dev251022.dist-info}/top_level.txt (file without changes)