tico 0.1.0.dev250901__py3-none-any.whl → 0.1.0.dev250903__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. tico/__init__.py +1 -1
  2. tico/experimental/quantization/ptq/__init__.py +1 -13
  3. tico/experimental/quantization/ptq/examples/compare_ppl.py +121 -0
  4. tico/experimental/quantization/ptq/examples/debug_quant_outputs.py +129 -0
  5. tico/experimental/quantization/ptq/examples/quantize_with_gptq.py +165 -0
  6. tico/experimental/quantization/ptq/observers/__init__.py +1 -15
  7. tico/experimental/quantization/ptq/observers/ema.py +1 -1
  8. tico/experimental/quantization/ptq/observers/minmax.py +1 -1
  9. tico/experimental/quantization/ptq/utils/__init__.py +1 -5
  10. tico/experimental/quantization/ptq/utils/introspection.py +169 -0
  11. tico/experimental/quantization/ptq/utils/metrics.py +123 -0
  12. tico/experimental/quantization/ptq/wrappers/__init__.py +1 -0
  13. tico/experimental/quantization/ptq/wrappers/llama/__init__.py +1 -9
  14. tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py +4 -1
  15. tico/experimental/quantization/ptq/wrappers/nn/__init__.py +1 -11
  16. tico/experimental/quantization/ptq/wrappers/registry.py +12 -9
  17. tico/passes/remove_redundant_assert_nodes.py +3 -1
  18. {tico-0.1.0.dev250901.dist-info → tico-0.1.0.dev250903.dist-info}/METADATA +1 -1
  19. {tico-0.1.0.dev250901.dist-info → tico-0.1.0.dev250903.dist-info}/RECORD +23 -18
  20. {tico-0.1.0.dev250901.dist-info → tico-0.1.0.dev250903.dist-info}/LICENSE +0 -0
  21. {tico-0.1.0.dev250901.dist-info → tico-0.1.0.dev250903.dist-info}/WHEEL +0 -0
  22. {tico-0.1.0.dev250901.dist-info → tico-0.1.0.dev250903.dist-info}/entry_points.txt +0 -0
  23. {tico-0.1.0.dev250901.dist-info → tico-0.1.0.dev250903.dist-info}/top_level.txt +0 -0
tico/__init__.py CHANGED
@@ -29,7 +29,7 @@ __all__ = [
  ]

  # THIS LINE IS AUTOMATICALLY GENERATED BY setup.py
- __version__ = "0.1.0.dev250901"
+ __version__ = "0.1.0.dev250903"

  MINIMUM_SUPPORTED_VERSION = "2.5.0"
  SECURE_TORCH_VERSION = "2.6.0"
tico/experimental/quantization/ptq/__init__.py CHANGED
@@ -1,13 +1 @@
- """
- Public PTQ API — re-export the most common symbols.
- """
-
- from tico.experimental.quantization.ptq.dtypes import DType
- from tico.experimental.quantization.ptq.mode import Mode
- from tico.experimental.quantization.ptq.qscheme import QScheme
-
- __all__ = [
-     "DType",
-     "Mode",
-     "QScheme",
- ]
+ # DO NOT REMOVE THIS FILE
tico/experimental/quantization/ptq/examples/compare_ppl.py ADDED
@@ -0,0 +1,121 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # =============================================================================
+ # QUICK PTQ WORKFLOW (OPTIONAL FP32 BASELINE)
+ # -----------------------------------------------------------------------------
+ # Toggle RUN_FP to choose between:
+ #   • FP32 perplexity measurement only, OR
+ #   • Full post-training UINT-8 flow (wrap → calibrate → eval).
+ # =============================================================================
+
+ import torch
+ import tqdm
+ from datasets import load_dataset
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from tico.experimental.quantization.ptq.quant_config import QuantConfig
+ from tico.experimental.quantization.ptq.utils.metrics import perplexity
+ from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
+
+ # -------------------------------------------------------------------------
+ # 0. Global configuration
+ # -------------------------------------------------------------------------
+ MODEL_NAME = "meta-llama/Meta-Llama-3-1B"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ STRIDE = 512  # sliding-window stride for perplexity
+ RUN_FP = True  # set False → run UINT-8 path
+
+ # Token-budget presets for activation calibration
+ TOKENS: dict[str, int] = {
+     # Smoke test (<1 min turnaround on CPU/GPU)
+     "debug": 2_000,  # ≈16 × 128-seq batches
+     # Good default for 1-7B models (≲3 % ppl delta)
+     "baseline": 50_000,
+     # Production / 4-bit observer smoothing
+     "production": 200_000,
+ }
+ CALIB_TOKENS = TOKENS["baseline"]
+ print(f"Calibrating with {CALIB_TOKENS:,} tokens.\n")
+
+ # -------------------------------------------------------------------------
+ # 1. Load model
+ # -------------------------------------------------------------------------
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+ if RUN_FP:
+     # -- FP32 baseline ------------------------------------------------------
+     print("Loading FP32 model …")
+     fp_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()
+     fp_model.config.use_cache = False
+ else:
+     # -- UINT-8 pipeline -----------------------------------------------------
+     print("Creating UINT-8 clone …")
+     uint8_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()
+     uint8_model.config.use_cache = False
+
+     # ---------------------------------------------------------------------
+     # 2. Wrap every Transformer layer with PTQWrapper
+     # ---------------------------------------------------------------------
+     qcfg = QuantConfig()  # all-uint8 defaults
+
+     wrapped_layers = torch.nn.ModuleList()
+     for idx, layer in enumerate(uint8_model.model.layers):
+         layer_cfg = qcfg.child(f"layer{idx}")
+         wrapped_layers.append(PTQWrapper(layer, qcfg=layer_cfg))
+     uint8_model.model.layers = wrapped_layers
+
+     # ---------------------------------------------------------------------
+     # 3. Single-pass activation calibration
+     # ---------------------------------------------------------------------
+     print("Calibrating UINT-8 observers …")
+     calib_txt = " ".join(
+         load_dataset("wikitext", "wikitext-2-raw-v1", split="train")["text"]
+     )[:CALIB_TOKENS]
+     ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(DEVICE)
+
+     # (a) switch every QuantModuleBase to CALIB mode
+     for l in uint8_model.model.layers:
+         l.enable_calibration()
+
+     # (b) run inference to collect ranges
+     with torch.no_grad():
+         for i in tqdm.trange(0, ids.size(1) - 1, STRIDE, desc="Calibration"):
+             uint8_model(ids[:, i : i + STRIDE])
+
+     # (c) freeze (scale, zero-point)
+     for l in uint8_model.model.layers:
+         l.freeze_qparams()
+
+ # -------------------------------------------------------------------------
+ # 4. Evaluate perplexity on Wikitext-2
+ # -------------------------------------------------------------------------
+ print("\nCalculating perplexities …")
+ test_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+ enc = tokenizer("\n\n".join(test_ds["text"]), return_tensors="pt")
+
+ if RUN_FP:
+     ppl_fp = perplexity(fp_model, enc, DEVICE, stride=STRIDE)
+ else:
+     ppl_int8 = perplexity(uint8_model, enc, DEVICE, stride=STRIDE)
+
+ # -------------------------------------------------------------------------
+ # 5. Report
+ # -------------------------------------------------------------------------
+ print("\n┌── Wikitext-2 test perplexity ─────────────")
+ if RUN_FP:
+     print(f"│ FP32 : {ppl_fp:8.2f}")
+ else:
+     print(f"│ UINT-8 : {ppl_int8:8.2f}")
+ print("└───────────────────────────────────────────")
tico/experimental/quantization/ptq/examples/debug_quant_outputs.py ADDED
@@ -0,0 +1,129 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import torch
+ import tqdm
+ from datasets import load_dataset
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from tico.experimental.quantization.ptq.quant_config import QuantConfig
+ from tico.experimental.quantization.ptq.utils.introspection import (
+     build_fqn_map,
+     compare_layer_outputs,
+     save_fp_outputs,
+ )
+ from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
+
+ # ============================================================================
+ # LAYER-WISE DIFF DEBUGGING PIPELINE
+ # ----------------------------------------------------------------------------
+ # A quantization debugging pipeline that identifies accuracy regressions
+ # by comparing UINT vs FP outputs at each layer.
+ #
+ # 1. Load a full-precision (FP) LLaMA-3-1B model.
+ # 2. Wrap each Transformer block with PTQWrapper (activations → fake-quant).
+ # 3. Capture reference FP layer outputs before quantization.
+ # 4. Calibrate UINT-8 activation observers in a single pass.
+ # 5. Freeze quantization parameters (scale, zero-point).
+ # 6. Re-run inference and compare UINT-8 vs FP outputs per layer.
+ # 7. Report where quantization hurts the most.
+ #
+ # Use this pipeline to trace precision loss layer by layer, and pinpoint
+ # problematic modules during post-training quantization.
+ # ============================================================================
+
+ # -------------------------------------------------------------------------
+ # 0. Global configuration
+ # -------------------------------------------------------------------------
+ MODEL_NAME = "meta-llama/Meta-Llama-3-1B"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ STRIDE = 512
+
+ # Token-budget presets for activation calibration
+ TOKENS: dict[str, int] = {
+     # Smoke test (<1 min turnaround on CPU/GPU)
+     "debug": 2_000,  # ≈16 × 128-seq batches
+     # Good default for 1-7B models (≲3 % ppl delta)
+     "baseline": 50_000,
+     # Production / 4-bit observer smoothing
+     "production": 200_000,
+ }
+ CALIB_TOKENS = TOKENS["baseline"]
+ print(f"Calibrating with {CALIB_TOKENS:,} tokens.\n")
+
+ # -------------------------------------------------------------------------
+ # 1. Load the FP backbone
+ # -------------------------------------------------------------------------
+ print("Loading FP model …")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()
+ model.config.use_cache = False  # disable KV-cache → full forward
+ m_to_fqn = build_fqn_map(model)  # map modules → fully-qualified names
+
+ # Use Wikitext-2 train split for calibration.
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
+
+ # -------------------------------------------------------------------------
+ # 2. Wrap every layer with PTQWrapper (UINT-8 activations)
+ # -------------------------------------------------------------------------
+ print("Wrapping layers with PTQWrapper …")
+ qcfg = QuantConfig()  # default: per-tensor UINT8
+
+ new_layers = torch.nn.ModuleList()
+ for idx, fp_layer in enumerate(model.model.layers):
+     layer_cfg = qcfg.child(f"layer{idx}")
+     q_layer = PTQWrapper(
+         fp_layer,
+         qcfg=layer_cfg,
+         fp_name=m_to_fqn.get(fp_layer),
+     )
+     new_layers.append(q_layer)
+
+ model.model.layers = new_layers  # swap in quant wrappers
+
+ # -------------------------------------------------------------------------
+ # 3. Activation calibration plus FP-vs-UINT8 diffing
+ # -------------------------------------------------------------------------
+ print("Calibrating UINT-8 observers …")
+ calib_txt = " ".join(dataset["text"])[:CALIB_TOKENS]
+ ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(DEVICE)
+
+ # (a) Enable CALIB mode on every QuantModuleBase
+ for l in model.model.layers:
+     l.enable_calibration()
+
+ # Save reference FP activations before observers clamp/quantize
+ save_handles, act_cache = save_fp_outputs(model)
+
+ with torch.no_grad():
+     for i in tqdm.trange(0, ids.size(1) - 1, STRIDE, desc="Act-calibration"):
+         inputs = ids[:, i : i + STRIDE]
+         model(inputs)  # observers collect act. ranges
+
+ # Remove save hooks now that FP activations are cached
+ for h in save_handles:
+     h.remove()
+
+ # (b) Freeze (scale, zero-point) after calibration
+ for l in model.model.layers:
+     l.freeze_qparams()
+
+ # (c) Register diff hooks and measure per-layer deltas
+ cmp_handles = compare_layer_outputs(model, act_cache, metrics=["diff", "peir"])
+ # Use same inputs for comparison.
+ model(inputs)
+
+ assert isinstance(cmp_handles, list)
+ for h in cmp_handles:
+     h.remove()
tico/experimental/quantization/ptq/examples/quantize_with_gptq.py ADDED
@@ -0,0 +1,165 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # =============================================================================
+ # PTQ + GPTQ HYBRID QUANTIZATION PIPELINE
+ # -----------------------------------------------------------------------------
+ # This script shows how to:
+ # 1. Load a pretrained FP Llama-3 model.
+ # 2. Run GPTQ to quantize weights only.
+ # 3. Wrap every Transformer layer with a PTQWrapper to quantize activations.
+ # 4. Calibrate UINT-8 observers in a single pass over a text corpus.
+ # 5. Inject GPTQ’s per-tensor weight scales / zero-points into the PTQ graph.
+ # 6. Freeze all Q-params and compute Wikitext-2 perplexity.
+ # =============================================================================
+
+ from typing import Any
+
+ import torch
+ import tqdm
+ from datasets import load_dataset
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from tico.experimental.quantization import convert, prepare
+ from tico.experimental.quantization.config import GPTQConfig
+ from tico.experimental.quantization.ptq.observers.affine_base import AffineObserverBase
+ from tico.experimental.quantization.ptq.quant_config import QuantConfig
+ from tico.experimental.quantization.ptq.utils.introspection import build_fqn_map
+ from tico.experimental.quantization.ptq.utils.metrics import perplexity
+ from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
+ from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
+     QuantModuleBase,
+ )
+
+ # -------------------------------------------------------------------------
+ # 0. Global configuration
+ # -------------------------------------------------------------------------
+ MODEL_NAME = "meta-llama/Meta-Llama-3-1B"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ STRIDE = 512
+
+ # Token-budget presets for activation calibration
+ TOKENS: dict[str, int] = {
+     # Smoke test (<1 min turnaround on CPU/GPU)
+     "debug": 2_000,  # ≈16 × 128-seq batches
+     # Good default for 1-7B models (≲3 % ppl delta)
+     "baseline": 50_000,
+     # Production / 4-bit observer smoothing
+     "production": 200_000,
+ }
+ CALIB_TOKENS = TOKENS["baseline"]
+
+ # -------------------------------------------------------------------------
+ # 1. Helper — copy GPTQ (scale, zp) into PTQ observers
+ # -------------------------------------------------------------------------
+ def inject_gptq_qparams(
+     root: torch.nn.Module,
+     gptq_quantizers: dict[str, Any],  # {fp_name: quantizer}
+     weight_obs_name: str = "weight",
+ ):
+     """
+     For every `QuantModuleBase` whose `fp_name` matches a GPTQ key,
+     locate the observer called `weight_obs_name` and overwrite its
+     (scale, zero-point), then lock them against further updates.
+     """
+     for m in root.modules():
+         if not isinstance(m, QuantModuleBase):
+             continue
+         if m.fp_name is None:
+             continue
+         quantizer = gptq_quantizers.get(m.fp_name)
+         if quantizer is None:
+             continue
+         obs = m.get_observer(weight_obs_name)
+         if obs is None:
+             continue
+         assert isinstance(obs, AffineObserverBase)
+         # GPTQ quantizer attributes
+         obs.load_qparams(quantizer.scale, quantizer.zero, lock=True)
+
+
+ # -------------------------------------------------------------------------
+ # 2. Load the FP backbone
+ # -------------------------------------------------------------------------
+ print("Loading FP model …")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()
+ model.config.use_cache = False  # disable KV-cache → full forward
+ m_to_fqn = build_fqn_map(model)  # map modules → fully-qualified names
+
+ # -------------------------------------------------------------------------
+ # 3. Run GPTQ (weight-only) pass
+ # -------------------------------------------------------------------------
+ print("Applying GPTQ …")
+ dataset = load_dataset("wikiText", "wikitext-2-raw-v1", split="test")
+ q_m = prepare(model, GPTQConfig(), inplace=True)
+
+ for d in tqdm.tqdm(dataset, desc="GPTQ calibration"):
+     ids = tokenizer(d["text"], return_tensors="pt").input_ids.to(DEVICE)
+     q_m(ids)  # observers gather weight stats
+
+ q_m = convert(q_m, inplace=True)  # materialize INT-weight tensors
+
+ # -------------------------------------------------------------------------
+ # 4. Wrap every layer with PTQWrapper (activation UINT-8)
+ # -------------------------------------------------------------------------
+ qcfg = QuantConfig()  # default: per-tensor UINT8
+ new_layers = torch.nn.ModuleList()
+
+ for idx, fp_layer in enumerate(q_m.model.layers):
+     layer_cfg = qcfg.child(f"layer{idx}")
+     q_layer = PTQWrapper(
+         fp_layer,
+         qcfg=layer_cfg,
+         fp_name=m_to_fqn.get(fp_layer),
+     )
+     new_layers.append(q_layer)
+
+ q_m.model.layers = new_layers
+
+ # -------------------------------------------------------------------------
+ # 5. Single-pass activation calibration
+ # -------------------------------------------------------------------------
+ print("Calibrating UINT-8 observers …")
+ calib_txt = " ".join(
+     load_dataset("wikitext", "wikitext-2-raw-v1", split="train")["text"]
+ )[:CALIB_TOKENS]
+ ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(DEVICE)
+
+ # (a) Enable CALIB mode on every QuantModuleBase
+ for l in q_m.model.layers:
+     l.enable_calibration()
+
+ # (b) Overwrite weight observers with GPTQ statistics
+ inject_gptq_qparams(q_m, q_m.quantizers)
+
+ with torch.no_grad():
+     for i in tqdm.trange(0, ids.size(1) - 1, STRIDE, desc="Act-calibration"):
+         q_m(ids[:, i : i + STRIDE])  # observers collect act. ranges
+
+ # (c) Freeze all Q-params (scale, zp)
+ for l in q_m.model.layers:
+     l.freeze_qparams()
+
+ # -------------------------------------------------------------------------
+ # 6. Evaluate perplexity on Wikitext-2
+ # -------------------------------------------------------------------------
+ print("\nCalculating perplexities …")
+ test_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+ enc = tokenizer("\n\n".join(test_ds["text"]), return_tensors="pt")
+ ppl_uint8 = perplexity(q_m, enc, DEVICE, stride=STRIDE)
+
+ print("\n┌── Wikitext-2 test perplexity ─────────────")
+ print(f"│ UINT-8 : {ppl_uint8:8.2f}")
+ print("└───────────────────────────────────────────")
tico/experimental/quantization/ptq/observers/__init__.py CHANGED
@@ -1,15 +1 @@
- from tico.experimental.quantization.ptq.observers.affine_base import AffineObserverBase
- from tico.experimental.quantization.ptq.observers.base import ObserverBase
- from tico.experimental.quantization.ptq.observers.ema import EMAObserver
- from tico.experimental.quantization.ptq.observers.identity import IdentityObserver
- from tico.experimental.quantization.ptq.observers.minmax import MinMaxObserver
- from tico.experimental.quantization.ptq.observers.mx import MXObserver
-
- __all__ = [
-     "AffineObserverBase",
-     "ObserverBase",
-     "EMAObserver",
-     "IdentityObserver",
-     "MinMaxObserver",
-     "MXObserver",
- ]
+ # DO NOT REMOVE THIS FILE
tico/experimental/quantization/ptq/observers/ema.py CHANGED
@@ -15,7 +15,7 @@
  import torch

  from tico.experimental.quantization.ptq.observers.affine_base import AffineObserverBase
- from tico.experimental.quantization.ptq.utils import channelwise_minmax
+ from tico.experimental.quantization.ptq.utils.reduce_utils import channelwise_minmax


  class EMAObserver(AffineObserverBase):
tico/experimental/quantization/ptq/observers/minmax.py CHANGED
@@ -15,7 +15,7 @@
  import torch

  from tico.experimental.quantization.ptq.observers.affine_base import AffineObserverBase
- from tico.experimental.quantization.ptq.utils import channelwise_minmax
+ from tico.experimental.quantization.ptq.utils.reduce_utils import channelwise_minmax


  class MinMaxObserver(AffineObserverBase):
tico/experimental/quantization/ptq/utils/__init__.py CHANGED
@@ -1,5 +1 @@
- from tico.experimental.quantization.ptq.utils.reduce_utils import channelwise_minmax
-
- __all__ = [
-     "channelwise_minmax",
- ]
+ # DO NOT REMOVE THIS FILE
tico/experimental/quantization/ptq/utils/introspection.py ADDED
@@ -0,0 +1,169 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Callable, Dict, List, Optional, Tuple
+
+ import torch
+
+ from tico.experimental.quantization.evaluation.metric import MetricCalculator
+ from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
+ from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
+     QuantModuleBase,
+ )
+
+
+ def build_fqn_map(root: torch.nn.Module) -> dict[torch.nn.Module, str]:
+     """
+     Return {module_object: full_qualified_name} without touching the modules.
+     """
+     return {m: n for n, m in root.named_modules()}
+
+
+ def save_fp_outputs(
+     model: torch.nn.Module,
+ ) -> Tuple[List[torch.utils.hooks.RemovableHandle], Dict[str, torch.Tensor]]:
+     """
+     Register forward-hooks on every `QuantModuleBase` wrapper itself (not the
+     wrapped `module`) and cache its output while the wrapper runs in CALIB mode.
+
+     Parameters
+     ----------
+     model : torch.nn.Module
+         The model whose wrappers are already switched to CALIB mode
+         (`enable_calibration()` has been called).
+
+     Returns
+     -------
+     handles : list[RemovableHandle]
+         Hook handles; call `.remove()` on each one to detach the hooks.
+     cache : dict[str, torch.Tensor]
+         Mapping "wrapper-name → cached FP32 activation" captured from the first
+         forward pass. Keys default to `wrapper.fp_name`; if that attribute is
+         `None`, the `id(wrapper)` string is used instead.
+     """
+     cache: Dict[str, torch.Tensor] = {}
+     handles: List[torch.utils.hooks.RemovableHandle] = []
+
+     def _save(name: str):
+         def hook(_, __, out: torch.Tensor | Tuple):
+             if isinstance(out, tuple):
+                 out = out[0]
+             assert isinstance(out, torch.Tensor)
+             cache[name] = out.detach()
+
+         return hook
+
+     for m in model.modules():
+         if isinstance(m, QuantModuleBase):
+             name = m.fp_name or str(id(m))
+             handles.append(m.register_forward_hook(_save(name)))
+
+     return handles, cache
+
+
+ def compare_layer_outputs(
+     model: torch.nn.Module,
+     cache: Dict[str, torch.Tensor],
+     *,
+     metrics: Optional[List[str]] = None,
+     custom_metrics: Optional[Dict[str, Callable]] = None,
+     rtol: float = 1e-3,
+     atol: float = 1e-3,
+     collect: bool = False,
+ ):
+     """
+     Register forward-hooks on every `QuantModuleBase` wrapper to compare its
+     QUANT-mode output to the FP32 reference saved by `save_fp_outputs()`.
+
+     Each hook prints a per-layer diff report:
+
+         ✓ layer_name max=1.23e-02 mean=8.45e-04 (within tolerance)
+         ⚠️ layer_name max=3.07e+00 mean=5.12e-01 (exceeds tolerance)
+
+     Parameters
+     ----------
+     model : torch.nn.Module
+         The model whose wrappers are now in QUANT mode
+         (`freeze_qparams()` has been called).
+     cache : dict[str, torch.Tensor]
+         The reference activations captured during CALIB mode.
+     metrics
+         Metrics to compute. Defaults to `["diff"]`. Add `peir` to print PEIR.
+     custom_metrics
+         Optional user metric functions. Same signature as built-ins.
+     rtol, atol : float, optional
+         Relative / absolute tolerances used to flag large deviations
+         (similar to `torch.allclose` semantics).
+     collect : bool, optional
+         • False (default) → print one-line report per layer, return `None`
+         • True → suppress printing, return a nested dict
+             {layer_name -> {metric -> value}}
+
+     Returns
+     -------
+     handles
+         Hook handles; call `.remove()` once diffing is complete.
+     results
+         Only if *collect* is True.
+     """
+     metrics = metrics or ["diff"]
+     calc = MetricCalculator(custom_metrics)
+     handles: List[torch.utils.hooks.RemovableHandle] = []
+     results: Dict[
+         str, Dict[str, float]
+     ] = {}  # Dict[layer_name, Dict[metric_name, value]]
+
+     def _cmp(name: str):
+         ref = cache.get(name)
+
+         def hook(_, __, out):
+             if ref is None:
+                 if not collect:
+                     print(f"[{name}] no cached reference")
+                 return
+             if isinstance(out, tuple):
+                 out = out[0]
+             assert isinstance(out, torch.Tensor)
+
+             # Compute all requested metrics
+             res = calc.compute([ref], [out], metrics)  # lists with length-1 tensors
+             res = {k: v[0] for k, v in res.items()}  # flatten
+
+             if collect:
+                 results[name] = res  # type: ignore[assignment]
+                 return
+
+             # Pretty print ------------------------------------------------ #
+             diff_val = res.get("diff") or res.get("max_abs_diff")
+             thresh = atol + rtol * ref.abs().max().item()
+             flag = "⚠️" if (diff_val is not None and diff_val > thresh) else "✓"  # type: ignore[operator]
+
+             pieces = [f"{flag} {name:45s}"]
+             for key, val in res.items():
+                 pieces.append(f"{key}={val:<7.4}")
+             print(" ".join(pieces))
+
+         return hook
+
+     for m in model.modules():
+         if isinstance(m, PTQWrapper):
+             # skip the internal fp module inside the wrapper
+             continue
+         if isinstance(m, QuantModuleBase):
+             lname = m.fp_name or str(id(m))
+             handles.append(m.register_forward_hook(_cmp(lname)))
+
+     if collect:
+         return handles, results
+     return handles
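The `collect=True` mode described in the docstring above can also be used to rank layers rather than print them. A minimal sketch, assuming the `model`, `act_cache`, and `inputs` variables from the debug_quant_outputs.py example earlier in this diff, and treating the collected metric values as scalar-like:

from tico.experimental.quantization.ptq.utils.introspection import compare_layer_outputs

# Register comparison hooks in "collect" mode instead of print mode.
handles, results = compare_layer_outputs(
    model, act_cache, metrics=["diff", "peir"], collect=True
)
model(inputs)  # one forward pass; the hooks fill `results`
for h in handles:
    h.remove()

# Rank layers by PEIR, worst first.
worst = sorted(results.items(), key=lambda kv: float(kv[1].get("peir", 0.0)), reverse=True)
for name, vals in worst[:5]:
    print(name, vals)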
tico/experimental/quantization/ptq/utils/metrics.py ADDED
@@ -0,0 +1,123 @@
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Optional
+
+ import torch
+ import tqdm
+
+
+ def perplexity(
+     model: torch.nn.Module,
+     encodings: torch.Tensor,
+     device: torch.device | str,
+     *,
+     max_length: Optional[int] = None,
+     stride: int = 512,
+     ignore_index: int | None = -100,
+     show_progress: bool = True,
+ ) -> float:
+     """
+     Compute perplexity (PPL) using a "strided sliding-window"
+     evaluation strategy.
+
+     The function:
+     1. Splits the token sequence into overlapping windows of length
+        `max_length` (model context size).
+     2. Masks tokens that were already scored in previous windows
+        (`labels == -100`), so each token's negative log-likelihood (NLL)
+        is counted EXACTLY once.
+     3. Aggregates token-wise NLL to return corpus-level PPL.
+
+     Parameters
+     ----------
+     model : torch.nn.Module
+         Causal LM loaded in evaluation mode (`model.eval()`).
+     encodings : torch.Tensor | transformers.BatchEncoding
+         Tokenised corpus. If a `BatchEncoding` is passed, its
+         `.input_ids` field is used. Shape must be `(1, seq_len)`.
+     device : torch.device | str
+         CUDA or CPU device on which to run evaluation.
+     max_length : int, optional
+         Context window size. Defaults to `model.config.max_position_embeddings`.
+     stride : int, default = 512
+         Step size by which the sliding window advances. Must satisfy
+         `1 ≤ stride ≤ max_length`.
+     ignore_index : int, default = -100
+         Label value to ignore in loss computation. This should match
+         the `ignore_index` used by the model's internal
+         `CrossEntropyLoss`. For Hugging Face causal LMs, the
+         convention is `-100`.
+     show_progress : bool, default = True
+         If True, displays a tqdm progess bar while evaluating.
+
+     Returns
+     -------
+     float
+         Corpus-level perplexity.
+     """
+     # -------- input preparation -------- #
+     try:
+         # transformers.BatchEncoding has `input_ids`
+         input_ids_full = encodings.input_ids  # type: ignore[attr-defined]
+     except AttributeError:  # already a tensor
+         input_ids_full = encodings
+     assert isinstance(input_ids_full, torch.Tensor)
+     input_ids_full = input_ids_full.to(device)
+
+     if max_length is None:
+         assert hasattr(model, "config")
+         assert hasattr(model.config, "max_position_embeddings")
+         assert isinstance(model.config.max_position_embeddings, int)
+         max_length = model.config.max_position_embeddings
+     assert max_length is not None
+     assert (
+         1 <= stride <= max_length
+     ), f"stride ({stride}) must be in [1, max_length ({max_length})]"
+
+     seq_len = input_ids_full.size(1)
+     nll_sum = 0.0
+     n_tokens = 0
+     prev_end = 0
+
+     # -------- main loop -------- #
+     for begin in tqdm.trange(0, seq_len, stride, desc="PPL", disable=not show_progress):
+         end = min(begin + max_length, seq_len)
+         trg_len = end - prev_end  # fresh tokens in this window
+
+         input_ids = input_ids_full[:, begin:end]
+         target_ids = input_ids.clone()
+         target_ids[:, :-trg_len] = ignore_index  # mask previously-scored tokens
+
+         with torch.no_grad():
+             outputs = model(input_ids, labels=target_ids)
+             # loss is already averaged over non-masked labels
+             neg_log_likelihood = outputs.loss
+
+         # exact number of labels that contributed to loss
+         loss_tokens = (target_ids[:, 1:] != ignore_index).sum().item()
+         nll_sum += neg_log_likelihood * loss_tokens
+         n_tokens += int(loss_tokens)
+
+         prev_end = end
+         if end == seq_len:
+             break
+
+     avg_nll: float | torch.Tensor = nll_sum / n_tokens
+     if not isinstance(avg_nll, torch.Tensor):
+         avg_nll = torch.tensor(avg_nll)
+     assert isinstance(avg_nll, torch.Tensor)
+     ppl = torch.exp(avg_nll)
+
+     return ppl.item()
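The aggregation inside `perplexity()` weights each window's mean loss by the number of fresh (non-masked) labels before exponentiating, so the corpus-level value is exp(total NLL / total label count). A small self-contained restatement of that arithmetic, with made-up window values used purely for illustration:

import math

# (mean_loss, fresh_label_count) per sliding window; illustrative numbers only
windows = [(2.31, 512), (2.27, 512), (2.40, 131)]

total_nll = sum(loss * n for loss, n in windows)  # mirrors `nll_sum += neg_log_likelihood * loss_tokens`
total_tok = sum(n for _, n in windows)            # mirrors `n_tokens += loss_tokens`
ppl = math.exp(total_nll / total_tok)             # corpus-level perplexity
print(f"{ppl:.2f}")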
tico/experimental/quantization/ptq/wrappers/__init__.py CHANGED
@@ -0,0 +1 @@
+ # DO NOT REMOVE THIS FILE
tico/experimental/quantization/ptq/wrappers/llama/__init__.py CHANGED
@@ -1,9 +1 @@
- from tico.experimental.quantization.ptq.wrappers.llama.quant_attn import (
-     QuantLlamaAttention,
- )
- from tico.experimental.quantization.ptq.wrappers.llama.quant_decoder_layer import (
-     QuantLlamaDecoderLayer,
- )
- from tico.experimental.quantization.ptq.wrappers.llama.quant_mlp import QuantLlamaMLP
-
- __all__ = ["QuantLlamaAttention", "QuantLlamaDecoderLayer", "QuantLlamaMLP"]
+ # DO NOT REMOVE THIS FILE
tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py CHANGED
@@ -25,7 +25,10 @@ from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
  from tico.experimental.quantization.ptq.wrappers.registry import try_register


- @try_register("transformers.models.llama.modeling_llama.LlamaAttention")
+ @try_register(
+     "transformers.models.llama.modeling_llama.LlamaAttention",
+     "transformers.models.llama.modeling_llama.LlamaSdpaAttention",
+ )
  class QuantLlamaAttention(QuantModuleBase):
      def __init__(
          self,
tico/experimental/quantization/ptq/wrappers/nn/__init__.py CHANGED
@@ -1,11 +1 @@
- from tico.experimental.quantization.ptq.wrappers.nn.quant_layernorm import (
-     QuantLayerNorm,
- )
- from tico.experimental.quantization.ptq.wrappers.nn.quant_linear import QuantLinear
- from tico.experimental.quantization.ptq.wrappers.nn.quant_silu import QuantSiLU
-
- __all__ = [
-     "QuantLayerNorm",
-     "QuantLinear",
-     "QuantSiLU",
- ]
+ # DO NOT REMOVE THIS FILE
tico/experimental/quantization/ptq/wrappers/registry.py CHANGED
@@ -90,7 +90,9 @@ def register(


  # ───────────────────────────── conditional decorator
- def try_register(path: str) -> Callable[[Type[QuantModuleBase]], Type[QuantModuleBase]]:
+ def try_register(
+     *paths: str,
+ ) -> Callable[[Type[QuantModuleBase]], Type[QuantModuleBase]]:
      """
      @try_register("transformers.models.llama.modeling_llama.LlamaMLP")

@@ -99,14 +101,15 @@ def try_register(path: str) -> Callable[[Type[QuantModuleBase]], Type[QuantModul
      """

      def _decorator(quant_cls: Type[QuantModuleBase]):
-         module_name, _, cls_name = path.rpartition(".")
-         try:
-             mod = importlib.import_module(module_name)
-             fp_cls = getattr(mod, cls_name)
-             _WRAPPERS[fp_cls] = quant_cls
-         except (ModuleNotFoundError, AttributeError):
-             # transformers not installed or class renamed – silently skip
-             pass
+         for path in paths:
+             module_name, _, cls_name = path.rpartition(".")
+             try:
+                 mod = importlib.import_module(module_name)
+                 fp_cls = getattr(mod, cls_name)
+                 _WRAPPERS[fp_cls] = quant_cls
+             except (ModuleNotFoundError, AttributeError):
+                 # optional dep missing or class renamed – skip silently
+                 pass
          return quant_cls

      return _decorator
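With the variadic signature above, one wrapper class can be registered for several upstream classes in a single decorator call, and any path that fails to import is skipped. A hedged usage sketch; the wrapper class below is a placeholder for illustration only, while the decorator call mirrors the quant_attn.py change shown earlier in this diff:

from tico.experimental.quantization.ptq.wrappers.quant_module_base import QuantModuleBase
from tico.experimental.quantization.ptq.wrappers.registry import try_register


@try_register(
    "transformers.models.llama.modeling_llama.LlamaAttention",
    "transformers.models.llama.modeling_llama.LlamaSdpaAttention",  # absent on some transformers versions; skipped silently
)
class MyQuantAttention(QuantModuleBase):  # hypothetical subclass, not part of tico
    ...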
tico/passes/remove_redundant_assert_nodes.py CHANGED
@@ -21,7 +21,9 @@ from tico.utils.utils import is_target_node


  assert_node_targets = [
+     torch.ops.aten._assert_scalar.default,
      torch.ops.aten._assert_tensor_metadata.default,
+     torch.ops.aten.sym_constrain_range_for_size.default,  # Related to symbolic shape validation
  ]


@@ -29,7 +31,7 @@ assert_node_targets = [
  class RemoveRedundantAssertionNodes(PassBase):
      """
      This removes redundant assertion nodes.
-     - `aten.assert_tensor_meta.default`
+     When assertion node is erased, related comparison nodes are also removed by graph.eliminate_dead_code().
      """

      def __init__(self):
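The docstring change above notes that once an assertion node is erased, `graph.eliminate_dead_code()` also sweeps away the comparison nodes that only fed it. A standalone torch.fx-style sketch of that idea, reusing the target list from the diff but not tico's PassBase machinery:

import torch
import torch.fx

ASSERT_TARGETS = {
    torch.ops.aten._assert_scalar.default,
    torch.ops.aten._assert_tensor_metadata.default,
    torch.ops.aten.sym_constrain_range_for_size.default,
}

def remove_assert_nodes(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
    for node in list(gm.graph.nodes):
        if node.op == "call_function" and node.target in ASSERT_TARGETS:
            gm.graph.erase_node(node)  # assertion outputs have no users, so erasing is safe
    gm.graph.eliminate_dead_code()     # drops comparison nodes that only fed the asserts
    gm.recompile()
    return gm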
{tico-0.1.0.dev250901.dist-info → tico-0.1.0.dev250903.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tico
- Version: 0.1.0.dev250901
+ Version: 0.1.0.dev250903
  Summary: Convert exported Torch module to circle
  Home-page: UNKNOWN
  License: UNKNOWN
{tico-0.1.0.dev250901.dist-info → tico-0.1.0.dev250903.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
- tico/__init__.py,sha256=MgvCVXWMpNL2dxPn54C8fdQaTJPdtHivhuNHH4qN5R8,1883
+ tico/__init__.py,sha256=Y253CsOm0_3d-G5glZ1O4TGs9GTjnvCqSpe8pqRIGgE,1883
  tico/pt2_to_circle.py,sha256=gu3MD4Iqc0zMZcCZ2IT8oGbyj21CTSbT3Rgd9s2B_9A,2767
  tico/config/__init__.py,sha256=xZzCXjZ84qE-CsBi-dfaL05bqpQ3stKKfTXhnrJRyVs,142
  tico/config/base.py,sha256=q5xMqGxTUZs4mFqt5c7i_y9U00fYgdMGl9nUqIVMlCo,1248
@@ -56,35 +56,40 @@ tico/experimental/quantization/passes/propagate_qparam_backward.py,sha256=TGtyW0
  tico/experimental/quantization/passes/propagate_qparam_forward.py,sha256=RhUHGCR2RpBO5KYkQ7Z8U5u7HEwDq2wdKHLKAJCi-5c,5138
  tico/experimental/quantization/passes/quantize_bias.py,sha256=T7YxJ70N0tSK0FF9VJZA5iP0sHdnnsX9GX4AT4JDFSk,4325
  tico/experimental/quantization/passes/remove_weight_dequant_op.py,sha256=gI1MtrHazWpdNfys7f1ngTTWplzluF7SA-uX0HMR5Mc,6592
- tico/experimental/quantization/ptq/__init__.py,sha256=ZoPdEwZ1i1n5pBFChx8GuUrkfRP2vsSoLPNILQjNBaA,298
+ tico/experimental/quantization/ptq/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
  tico/experimental/quantization/ptq/dtypes.py,sha256=xfCBtq6mQmUYRwsoFgII6gvRl1raQi0Inj9pznDuKwQ,2236
  tico/experimental/quantization/ptq/mode.py,sha256=lT-T8vIv8YWcwrjT7xXVhOw1g7aoAdh_3PWB-ptPKaI,1052
  tico/experimental/quantization/ptq/qscheme.py,sha256=uwhv7bCxOOXB3I-IKlRyr_u4eXOq48uIqGy4TLDqGxY,1301
  tico/experimental/quantization/ptq/quant_config.py,sha256=nm7570Y1X2mOT_8s27ilWid04otor6cVTi9GwgAEaKc,4300
  tico/experimental/quantization/ptq/examples/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
+ tico/experimental/quantization/ptq/examples/compare_ppl.py,sha256=SmSmaCBVWTcGRPRk2zopDqESD_gF8D7J4kUNNZ-0cMk,5295
+ tico/experimental/quantization/ptq/examples/debug_quant_outputs.py,sha256=astXzx-maq1W4gKvX2QaGmD2Tpmjunv4JqDYVk9eZRQ,5177
  tico/experimental/quantization/ptq/examples/quantize_linear.py,sha256=8zq-ZJDYgam0xQ-PbC6Xb1I7W1mv0Wi-b--IP2wwXtw,4539
  tico/experimental/quantization/ptq/examples/quantize_llama_attn.py,sha256=cVWUSSzaZWFp5QZkNkrlpHU3kXyP84QtnZbahVml_yQ,4329
  tico/experimental/quantization/ptq/examples/quantize_llama_decoder_layer.py,sha256=mBWrjkyEovYQsPC4Rrsri6Pm1rlFmDb3NiP0DQQhFyM,5751
  tico/experimental/quantization/ptq/examples/quantize_llama_mlp.py,sha256=N1qZQgt1S-xZrdv-PW7OfXEcv0gsO2q9faOF4aD-zKo,4147
- tico/experimental/quantization/ptq/observers/__init__.py,sha256=WF2MvL9M_jl-B1FqcY9zic34NOCRp17HkRYv-TMxMr4,613
+ tico/experimental/quantization/ptq/examples/quantize_with_gptq.py,sha256=w21Qao5_6SnWMuxmnZbZOoqaLQOuSnK52mHin4aedtA,6979
+ tico/experimental/quantization/ptq/observers/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
  tico/experimental/quantization/ptq/observers/affine_base.py,sha256=e2Eba64nrxKQyE4F_WJ7WTSsk3xe6bkdGUKaoLFWGFw,4638
  tico/experimental/quantization/ptq/observers/base.py,sha256=Wons1MzpqK1mfcy-ppl-B2Dum0edXg2dWW2Lw3V18tw,3280
- tico/experimental/quantization/ptq/observers/ema.py,sha256=MAMdBmjVNMg_vsqXrcBzbw_1nFJ-j4Gz651k3-VlaMQ,2057
+ tico/experimental/quantization/ptq/observers/ema.py,sha256=oISP1XaD3lapVaHQKscD3rjLcKbhOy4Nvi6dqRFZwF8,2070
  tico/experimental/quantization/ptq/observers/identity.py,sha256=vkec8Or-7VwM4zkFEvEKROQJk8XEHMVX8mBNDnxSyS8,2591
- tico/experimental/quantization/ptq/observers/minmax.py,sha256=mLHkwIzWFzQXev7EU7w1333KckwRjukc3_cUPJOnUfs,1486
+ tico/experimental/quantization/ptq/observers/minmax.py,sha256=WWcAyEIrd5j3k9qsoBJi3nUnWtrwPaKlR9CPezbDSqQ,1499
  tico/experimental/quantization/ptq/observers/mx.py,sha256=aP4qmBgeiRIYZJksShN5gs6UyYOFi2-Sbk5k5xvPQ4w,1863
- tico/experimental/quantization/ptq/utils/__init__.py,sha256=PL9IZgiWoMtsXVljeOy7KymmLVP238SXEFRLXYK72WQ,126
+ tico/experimental/quantization/ptq/utils/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
+ tico/experimental/quantization/ptq/utils/introspection.py,sha256=y2oGf7RoApMHJeXLmIz3VVWB9vazGEgyLbxLiVTTQdw,6000
+ tico/experimental/quantization/ptq/utils/metrics.py,sha256=EW_FQmJrl9Y4esspZQ0GHfJ58RwuJUz0l8IfYq3NWY4,4461
  tico/experimental/quantization/ptq/utils/reduce_utils.py,sha256=3kWawLB91EcvvHlCrNqqfZF7tpgr22htBSA049mKw_4,973
- tico/experimental/quantization/ptq/wrappers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tico/experimental/quantization/ptq/wrappers/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
  tico/experimental/quantization/ptq/wrappers/ptq_wrapper.py,sha256=F9sK_DiRaXiGNHULcwIbs5EUtHz6ZJ7N4r5CWTTfhsM,2442
  tico/experimental/quantization/ptq/wrappers/quant_elementwise.py,sha256=LhEoobfvto6zKrBOKL4gmxfFFc31jHzyQV_zfps-iQM,3604
  tico/experimental/quantization/ptq/wrappers/quant_module_base.py,sha256=vkcDos_knGSS29rIZuEIWkAJLHrENbGz8nCH2-iara8,5969
- tico/experimental/quantization/ptq/wrappers/registry.py,sha256=M1D_foC0PR-Ii4G0lbOO3_pmhvHlMF28NolK_q2DZtw,4783
- tico/experimental/quantization/ptq/wrappers/llama/__init__.py,sha256=4xuAYnJcohMTtBzrH4cxq8WKG2GQo8nbhektVg8w7F0,380
- tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py,sha256=WIUI6EFMTvvruvqu8pBxWy6qJeDyjkaYbJk1R3pAmwE,8578
+ tico/experimental/quantization/ptq/wrappers/registry.py,sha256=wauoZdZBR15bGj1Upt9owEfFDT-Tj6HzciG9HDM1BHo,4845
+ tico/experimental/quantization/ptq/wrappers/llama/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
+ tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py,sha256=-K1COLHIHfJZhQu-RE6KfJIkaL7S6yR4iUj48QkjMTw,8652
  tico/experimental/quantization/ptq/wrappers/llama/quant_decoder_layer.py,sha256=2XsIf5rcabDXXkahqriSxfo2curFq0Y5bnRPcYkJPg8,7187
  tico/experimental/quantization/ptq/wrappers/llama/quant_mlp.py,sha256=uZMnrX66oZwxhKhcNbLXXeri-WxxRBiZnr15aBXJMm0,3562
- tico/experimental/quantization/ptq/wrappers/nn/__init__.py,sha256=I9uTt5HfcRoMEDYHpAeATMv2TbCQiX0ZbfUFMzSJ4Qw,336
+ tico/experimental/quantization/ptq/wrappers/nn/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
  tico/experimental/quantization/ptq/wrappers/nn/quant_layernorm.py,sha256=G5Sgt-tXnzh0Rxyk-2honmZIfEQOZlRfOsoDBdSGmA4,6887
  tico/experimental/quantization/ptq/wrappers/nn/quant_linear.py,sha256=xW-VEPB7RJoslS3xLVCdhIuMjppknvpkZleRGK4JFVQ,2240
  tico/experimental/quantization/ptq/wrappers/nn/quant_silu.py,sha256=XnJDggkWUTfXC1-BLeAbcCUtp687XLIkIIbuQlqycDw,1864
@@ -119,7 +124,7 @@ tico/passes/lower_to_slice.py,sha256=OzlFzK3lBYyYwC3WThsWd94Ob4JINIJF8UaLAtnumzU
  tico/passes/merge_consecutive_cat.py,sha256=ayZNLDA1DFM7Fxxi2Dmk1CujkgUuaVCH1rhQgLrvvOQ,2701
  tico/passes/ops.py,sha256=cSj3Sk2x2cOE9b8oU5pmSa_rHr-iX2lORzu3N_UHMSQ,2967
  tico/passes/remove_nop.py,sha256=Hf91p_EJAOC6DyWNthash0_UWtEcNc_M7znamQfYQ5Y,2686
- tico/passes/remove_redundant_assert_nodes.py,sha256=IONd3xBy6I8tH6_Y1eN3_eCHH7WTC8soBgjXzOju9cQ,1612
+ tico/passes/remove_redundant_assert_nodes.py,sha256=rYbTCyuNIXIC-2NreHKBVCuaSUkEQvB_iSRzb26P_EA,1821
  tico/passes/remove_redundant_expand.py,sha256=auyqIoQT4HJhiJfuUe6BrEtUhvz221ohnIK5EuszWeg,2112
  tico/passes/remove_redundant_permute.py,sha256=98UsaZzFZdQzEEAR1pIzRisAf6hgfXLa88aayjalt3E,4292
  tico/passes/remove_redundant_reshape.py,sha256=aeep6LDvY58GEuOrWckkEXnJa6wkkbiJ9FrimT9F3-s,16384
@@ -242,9 +247,9 @@ tico/utils/mx/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
  tico/utils/mx/elemwise_ops.py,sha256=V6glyAHsVR1joqpsgnNytatCD_ew92xNWZ19UFDoMTA,10281
  tico/utils/mx/formats.py,sha256=uzNWyu-1onUlwQfX5cZ6fZSUfHMRqorper7_T1k3jfk,3404
  tico/utils/mx/mx_ops.py,sha256=RcfUTYVi-wilGB2sC35OeARdwDqnixv7dG5iyZ-fQT8,8555
- tico-0.1.0.dev250901.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
- tico-0.1.0.dev250901.dist-info/METADATA,sha256=LMgoYoHYFT8cJU9VNYiiX89tMSxEX30x17x_6eWAr4o,8450
- tico-0.1.0.dev250901.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
- tico-0.1.0.dev250901.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
- tico-0.1.0.dev250901.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
- tico-0.1.0.dev250901.dist-info/RECORD,,
+ tico-0.1.0.dev250903.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
+ tico-0.1.0.dev250903.dist-info/METADATA,sha256=3mrCVEvr_rIq-iHE5roTYJn1Pw3_3za-EMoxG-yg3dg,8450
+ tico-0.1.0.dev250903.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+ tico-0.1.0.dev250903.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
+ tico-0.1.0.dev250903.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
+ tico-0.1.0.dev250903.dist-info/RECORD,,