tico 0.1.0.dev250907__py3-none-any.whl → 0.1.0.dev250909__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tico/__init__.py CHANGED
@@ -29,7 +29,7 @@ __all__ = [
29
29
  ]
30
30
 
31
31
  # THIS LINE IS AUTOMATICALLY GENERATED BY setup.py
32
- __version__ = "0.1.0.dev250907"
32
+ __version__ = "0.1.0.dev250909"
33
33
 
34
34
  MINIMUM_SUPPORTED_VERSION = "2.5.0"
35
35
  SECURE_TORCH_VERSION = "2.6.0"
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import functools
16
- from typing import Any, Dict, List
16
+ from typing import Any, Dict, List, Literal
17
17
 
18
18
  import torch
19
19
 
@@ -21,18 +21,24 @@ import torch
21
21
  class ChannelwiseMaxActsObserver:
22
22
  """
23
23
  Observer to calculate channelwise maximum activation
24
+ It supports collecting activations from either module inputs or outputs.
24
25
  """
25
26
 
26
- def __init__(self, model):
27
+ def __init__(
28
+ self, model: torch.nn.Module, acts_from: Literal["input", "output"] = "input"
29
+ ):
27
30
  """
28
31
  model
29
32
  A torch module whose activations are to be analyzed.
33
+ acts_from
34
+ Where to hook: "input" for forward-pre-hook, "output" for forward-hook.
30
35
  hooks
31
- A list to store the hooks which are registered to collect activation statistics.
36
+ A list to store the hooks registered to collect activation statistics.
32
37
  max_acts
33
- A dictionary to store the maximum activation values
38
+ A dictionary to store the per-channel maxima.
34
39
  """
35
40
  self.model = model
41
+ self.acts_from: Literal["input", "output"] = acts_from
36
42
  self.hooks: List[Any] = []
37
43
  self.max_acts: Dict[str, torch.Tensor] = {}
38
44
 
@@ -62,13 +68,25 @@ class ChannelwiseMaxActsObserver:
62
68
  input = input[0]
63
69
  stat_tensor(name, input)
64
70
 
71
+ def stat_output_hook(m, input, output, name):
72
+ if isinstance(output, tuple):
73
+ output = output[0]
74
+ stat_tensor(name, output)
75
+
65
76
  for name, m in self.model.named_modules():
66
77
  if isinstance(m, torch.nn.Linear):
67
- self.hooks.append(
68
- m.register_forward_pre_hook(
69
- functools.partial(stat_input_hook, name=name)
78
+ if self.acts_from == "input":
79
+ self.hooks.append(
80
+ m.register_forward_pre_hook(
81
+ functools.partial(stat_input_hook, name=name)
82
+ )
83
+ )
84
+ else: # "output"
85
+ self.hooks.append(
86
+ m.register_forward_hook(
87
+ functools.partial(stat_output_hook, name=name)
88
+ )
70
89
  )
71
- )
72
90
 
73
91
  def remove(self):
74
92
  for hook in self.hooks:
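A minimal usage sketch of the observer with the new acts_from option, assuming the import path implied by this diff (tico.experimental.quantization.algorithm.smoothquant.observer); the toy model and calibration loop are illustrative, not part of the package:

import torch

from tico.experimental.quantization.algorithm.smoothquant.observer import (
    ChannelwiseMaxActsObserver,
)

model = torch.nn.Sequential(
    torch.nn.Linear(16, 32), torch.nn.ReLU(), torch.nn.Linear(32, 8)
)

observer = ChannelwiseMaxActsObserver(model, acts_from="output")
observer.attach()                      # registers forward-hooks on every nn.Linear
with torch.no_grad():
    for _ in range(4):                 # stand-in calibration loop
        model(torch.randn(2, 16))
observer.remove()

# max_acts maps module names (here "0" and "2") to per-channel maxima.
for name, amax in observer.max_acts.items():
    print(name, tuple(amax.shape))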
@@ -30,6 +30,25 @@ from tico.experimental.quantization.quantizer import BaseQuantizer
30
30
  class SmoothQuantQuantizer(BaseQuantizer):
31
31
  """
32
32
  Quantizer for applying the SmoothQuant algorithm
33
+
34
+ Q) Why allow choosing between input and output activations?
35
+
36
+ SmoothQuant relies on channel-wise activation statistics to balance
37
+ weights and activations. In practice, there are two natural sources:
38
+
39
+ - "input": captures the tensor right before a Linear layer
40
+ (forward-pre-hook). This matches the original SmoothQuant paper
41
+ and focuses on scaling the raw hidden state.
42
+
43
+ - "output": captures the tensor right after a Linear layer
44
+ (forward-hook). This can better reflect post-weight dynamics,
45
+ especially when subsequent operations (bias, activation functions)
46
+ dominate the dynamic range.
47
+
48
+ Allowing both options provides flexibility: depending on model
49
+ architecture and calibration data, one may yield lower error than
50
+ the other. The default remains "input" for compatibility, but "output"
51
+ can be selected when it empirically reduces error or runtime overhead.
33
52
  """
34
53
 
35
54
  def __init__(self, config: SmoothQuantConfig):
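To make the "input" vs "output" choice concrete: for a Linear layer, a forward-pre-hook observes a tensor with in_features channels, while a forward-hook observes one with out_features channels, so the collected per-channel statistics have different lengths. A small standalone illustration using plain PyTorch hooks (not the observer above):

import torch

lin = torch.nn.Linear(16, 32)
seen = {}

lin.register_forward_pre_hook(lambda m, inp: seen.__setitem__("input", inp[0].shape))
lin.register_forward_hook(lambda m, inp, out: seen.__setitem__("output", out.shape))

lin(torch.randn(4, 16))
print(seen)  # {'input': torch.Size([4, 16]), 'output': torch.Size([4, 32])}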
@@ -37,6 +56,7 @@ class SmoothQuantQuantizer(BaseQuantizer):
37
56
 
38
57
  self.alpha = config.alpha
39
58
  self.custom_alpha_map = config.custom_alpha_map
59
+ self.acts_from = config.acts_from # "input" (default) or "output"
40
60
  self.observer: Optional[ChannelwiseMaxActsObserver] = None
41
61
 
42
62
  @torch.no_grad()
@@ -55,7 +75,8 @@ class SmoothQuantQuantizer(BaseQuantizer):
55
75
  Returns:
56
76
  The model prepared for SmoothQuant quantization.
57
77
  """
58
- self.observer = ChannelwiseMaxActsObserver(model)
78
+ # Attach hooks according to `config.acts_from`
79
+ self.observer = ChannelwiseMaxActsObserver(model, acts_from=self.acts_from)
59
80
  self.observer.attach()
60
81
 
61
82
  return model
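For context, a hedged sketch of how acts_from travels from the config through prepare() to the observer; prepare() is called with only the model argument, which is all this hunk reveals, and the toy model and single calibration pass are illustrative:

import torch

from tico.experimental.quantization.config import SmoothQuantConfig
from tico.experimental.quantization.algorithm.smoothquant.quantizer import (
    SmoothQuantQuantizer,
)

cfg = SmoothQuantConfig(alpha=0.5, acts_from="output")
quantizer = SmoothQuantQuantizer(cfg)

model = torch.nn.Sequential(
    torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 4)
)
model = quantizer.prepare(model)     # attaches input- or output-hooks per cfg.acts_from

with torch.no_grad():                # stand-in calibration pass
    model(torch.randn(8, 16))

assert quantizer.observer is not None
stats = quantizer.observer.max_acts  # per-channel maxima, keyed by module name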
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import Dict, List, Optional
15
+ from typing import Callable, Dict, List, Optional
16
16
 
17
17
  import torch
18
18
 
@@ -110,6 +110,185 @@ def smooth_weights(
110
110
  back_m.weight.mul_(scales.view(1, -1)) # type: ignore[operator]
111
111
 
112
112
 
113
+ # TODO Split the files per model
114
+ # ────────────────────────────────────────────────────────────
115
+ # fairseq ReLU bridge (input-hook stats) helpers
116
+ # ────────────────────────────────────────────────────────────
117
+
118
+
119
+ @torch.no_grad()
120
+ def _compute_s_for_linear(
121
+ linear_like: torch.nn.Module, # 2D weight [out, in]
122
+ activation_max: torch.Tensor, # shape [in]
123
+ alpha: float,
124
+ ) -> torch.Tensor:
125
+ """
126
+ s = (amax^alpha / w_col_max^(1-alpha))
127
+ - amax: channel-wise max of the input to this module
128
+ - w_col_max: max(|W|) per input column
129
+ """
130
+ if not hasattr(linear_like, "weight"):
131
+ raise RuntimeError(f"{type(linear_like).__name__} has no 'weight' attribute.")
132
+ W = linear_like.weight # [out, in]
133
+ assert isinstance(W, torch.Tensor)
134
+ if W.ndim != 2:
135
+ raise RuntimeError(
136
+ f"Expected 2D weight, got {W.ndim}D for {type(linear_like).__name__}"
137
+ )
138
+
139
+ device, dtype = W.device, W.dtype
140
+ amax = activation_max.to(device=device, dtype=dtype)
141
+
142
+ if amax.numel() != W.shape[1]:
143
+ raise ValueError(
144
+ f"activation_max numel({amax.numel()}) != in_features({W.shape[1]})"
145
+ )
146
+
147
+ w_col_max = W.abs().max(dim=0)[0].clamp(min=1e-5) # [in]
148
+ s = (amax.pow(alpha) / w_col_max.pow(1.0 - alpha)).clamp(min=1e-5) # [in]
149
+ return s
150
+
151
+
152
+ @torch.no_grad()
153
+ def _fuse_relu_bridge_no_runtime_mul(
154
+ fc1: torch.nn.Module,
155
+ fc2: torch.nn.Module,
156
+ s_hidden: torch.Tensor,
157
+ ):
158
+ """
159
+ Fuse scaling across fc1 → ReLU → fc2 without runtime multiplies:
160
+ - fc1 rows *= 1/s, (fc1.bias *= 1/s)
161
+ - fc2 cols *= s
162
+ Assumes middle activation is ReLU (positive homogeneous).
163
+ """
164
+ if not hasattr(fc1, "weight") or not hasattr(fc2, "weight"):
165
+ raise RuntimeError("fc1/fc2 must have 'weight' attributes.")
166
+
167
+ W1, W2 = fc1.weight, fc2.weight
168
+ assert isinstance(W1, torch.Tensor) and isinstance(W2, torch.Tensor)
169
+ if W1.ndim != 2 or W2.ndim != 2:
170
+ raise RuntimeError("fc1/fc2 weights must be 2D.")
171
+
172
+ hidden = W1.shape[0]
173
+ if W2.shape[1] != hidden or s_hidden.numel() != hidden:
174
+ raise ValueError(
175
+ f"Dimension mismatch: hidden={hidden}, W2.in={W2.shape[1]}, s={s_hidden.numel()}"
176
+ )
177
+
178
+ s = s_hidden.to(device=W1.device, dtype=W1.dtype).clamp(min=1e-5) # [hidden]
179
+ inv_s = (1.0 / s).clamp(min=1e-5)
180
+
181
+ # fc1: row-wise scale
182
+ W1.mul_(inv_s.view(-1, 1))
183
+ if hasattr(fc1, "bias") and getattr(fc1, "bias") is not None:
184
+ assert isinstance(fc1.bias, torch.Tensor)
185
+ fc1.bias.mul_(inv_s)
186
+
187
+ # fc2: column-wise scale
188
+ W2.mul_(s.view(1, -1))
189
+
190
+
191
+ # ────────────────────────────────────────────────────────────
192
+ # Per-layer appliers (uniform protocol): return True if applied, else False
193
+ # ────────────────────────────────────────────────────────────
194
+
195
+
196
+ @torch.no_grad()
197
+ def _apply_if_llama_decoder(
198
+ name: str,
199
+ module: torch.nn.Module,
200
+ activation_max: Dict[str, torch.Tensor],
201
+ alpha_to_apply: float,
202
+ ) -> bool:
203
+ """
204
+ Apply LLaMA decoder-layer smoothing (input-hook stats).
205
+ Returns True if this handler applied smoothing to `module`.
206
+ """
207
+ try:
208
+ from transformers.models.llama.modeling_llama import ( # type: ignore
209
+ LlamaDecoderLayer,
210
+ )
211
+ except Exception:
212
+ return False
213
+
214
+ if not isinstance(module, LlamaDecoderLayer):
215
+ return False
216
+
217
+ attn_ln = module.input_layernorm
218
+ qkv = [
219
+ module.self_attn.q_proj,
220
+ module.self_attn.k_proj,
221
+ module.self_attn.v_proj,
222
+ ]
223
+ # Input-hook stats for q_proj input
224
+ qkv_input_scales = activation_max[name + ".self_attn.q_proj"]
225
+ smooth_weights(attn_ln, qkv, qkv_input_scales, alpha_to_apply)
226
+
227
+ ffn_ln = module.post_attention_layernorm
228
+ fcs = [module.mlp.gate_proj, module.mlp.up_proj]
229
+ # Input-hook stats for gate_proj input
230
+ fcs_input_scales = activation_max[name + ".mlp.gate_proj"]
231
+ smooth_weights(ffn_ln, fcs, fcs_input_scales, alpha_to_apply)
232
+
233
+ return True
234
+
235
+
236
+ @torch.no_grad()
237
+ def _apply_if_fairseq_relu_bridge(
238
+ name: str,
239
+ module: torch.nn.Module,
240
+ activation_max: Dict[str, torch.Tensor],
241
+ alpha_to_apply: float,
242
+ ) -> bool:
243
+ """
244
+ Apply fairseq Transformer (Encoder/Decoder) ReLU-FFN bridge fusion
245
+ using input-hook stats at '{name}.fc1'. Returns True if applied.
246
+ """
247
+ try:
248
+ from fairseq.modules.transformer_layer import (
249
+ TransformerDecoderLayerBase,
250
+ TransformerEncoderLayerBase,
251
+ ) # type: ignore
252
+ except Exception:
253
+ return False
254
+
255
+ if not isinstance(
256
+ module, (TransformerEncoderLayerBase, TransformerDecoderLayerBase)
257
+ ):
258
+ return False
259
+
260
+ # Only when FFN activation is ReLU (positive homogeneity)
261
+ act_fn = getattr(module, "activation_fn", None)
262
+ is_relu = (act_fn is torch.nn.functional.relu) or getattr(
263
+ act_fn, "__name__", ""
264
+ ) == "relu"
265
+ if not is_relu:
266
+ return False
267
+
268
+ fc1_key = f"{name}.fc1"
269
+ amax2 = activation_max.get(fc1_key)
270
+ if amax2 is None:
271
+ return False
272
+
273
+ fc1 = getattr(module, "fc1", None)
274
+ fc2 = getattr(module, "fc2", None)
275
+ if fc1 is None or fc2 is None or not hasattr(fc2, "weight") or fc2.weight.ndim != 2:
276
+ return False
277
+
278
+ s_hidden = _compute_s_for_linear(fc2, amax2, alpha_to_apply) # [hidden]
279
+ _fuse_relu_bridge_no_runtime_mul(fc1, fc2, s_hidden)
280
+ return True
281
+
282
+
283
+ # Registry of appliers (order matters: try LLaMA first, then fairseq)
284
+ _APPLIERS: List[
285
+ Callable[[str, torch.nn.Module, Dict[str, torch.Tensor], float], bool]
286
+ ] = [
287
+ _apply_if_llama_decoder,
288
+ _apply_if_fairseq_relu_bridge,
289
+ ]
290
+
291
+
113
292
  @torch.no_grad()
114
293
  def apply_smoothing(
115
294
  model: torch.nn.Module,
@@ -133,32 +312,16 @@ def apply_smoothing(
133
312
  Layers specified in this dictionary will use the corresponding alpha
134
313
  value instead of the default.
135
314
  """
136
- from transformers.models.llama.modeling_llama import LlamaDecoderLayer
137
-
138
315
  for name, module in model.named_modules():
139
- alpha_to_apply = alpha
140
- if custom_alpha_map and name in custom_alpha_map:
141
- alpha_to_apply = custom_alpha_map[name]
316
+ alpha_to_apply = (
317
+ custom_alpha_map.get(name, alpha) if custom_alpha_map else alpha
318
+ )
142
319
  if alpha_to_apply > 1.0:
143
320
  raise RuntimeError(
144
321
  f"Alpha value cannot exceed 1.0. Given alpha: {alpha_to_apply}"
145
322
  )
146
- # SmoothQuant is applied before capturing the graph. Therefore, it needs to know
147
- # specific module information.
148
- # TODO Suport more modules.
149
- if isinstance(module, LlamaDecoderLayer):
150
- attn_ln = module.input_layernorm
151
- qkv = [
152
- module.self_attn.q_proj,
153
- module.self_attn.k_proj,
154
- module.self_attn.v_proj,
155
- ]
156
-
157
- qkv_input_scales = activation_max[name + ".self_attn.q_proj"]
158
- smooth_weights(attn_ln, qkv, qkv_input_scales, alpha_to_apply)
159
-
160
- ffn_ln = module.post_attention_layernorm
161
- fcs = [module.mlp.gate_proj, module.mlp.up_proj]
162
- fcs_input_scales = activation_max[name + ".mlp.gate_proj"]
163
-
164
- smooth_weights(ffn_ln, fcs, fcs_input_scales, alpha_to_apply)
323
+
324
+ # Try each applier until one succeeds.
325
+ for applier in _APPLIERS:
326
+ if applier(name, module, activation_max, alpha_to_apply):
327
+ break # applied → stop trying others
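The fc1 → ReLU → fc2 fusion above hinges on ReLU's positive homogeneity (relu(c · x) = c · relu(x) for c > 0), so folding 1/s into fc1's rows and s into fc2's columns leaves the composed function unchanged. A small self-contained check of that identity, independent of the helpers above:

import torch

torch.manual_seed(0)
fc1 = torch.nn.Linear(8, 16)
fc2 = torch.nn.Linear(16, 4)
x = torch.randn(3, 8)

ref = fc2(torch.relu(fc1(x)))

s = torch.rand(16) + 0.5                      # strictly positive per-hidden-channel scales
with torch.no_grad():
    fc1.weight.mul_((1.0 / s).view(-1, 1))    # fc1 rows *= 1/s
    fc1.bias.mul_(1.0 / s)                    # fc1 bias *= 1/s
    fc2.weight.mul_(s.view(1, -1))            # fc2 cols *= s

fused = fc2(torch.relu(fc1(x)))
print(torch.allclose(ref, fused, atol=1e-5))  # True: the bridge is function-preserving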
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from abc import ABC, abstractmethod
16
- from typing import Dict, Optional
16
+ from typing import Dict, Literal, Optional
17
17
 
18
18
 
19
19
  class BaseConfig(ABC):
@@ -60,9 +60,15 @@ class SmoothQuantConfig(BaseConfig):
60
60
  self,
61
61
  alpha: float = 0.5,
62
62
  custom_alpha_map: Optional[Dict[str, float]] = None,
63
+ acts_from: Literal["input", "output"] = "input",
63
64
  ):
64
65
  self.alpha = alpha
65
66
  self.custom_alpha_map = custom_alpha_map
67
+ # Where to collect activation statistics from:
68
+ # - "input": use forward-pre-hook (Tensor before the Linear op)
69
+ # - "output": use forward-hook (Tensor after the Linear op)
70
+ # Default is "input".
71
+ self.acts_from = acts_from
66
72
 
67
73
  @property
68
74
  def name(self) -> str:
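A short sketch of the extended config, assuming only the constructor shown above; the layer name used in custom_alpha_map is hypothetical:

from tico.experimental.quantization.config import SmoothQuantConfig

cfg = SmoothQuantConfig(
    alpha=0.5,                                  # default smoothing strength
    custom_alpha_map={"model.layers.0": 0.75},  # per-layer override (hypothetical name)
    acts_from="output",                         # collect stats after each Linear
)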
@@ -0,0 +1,5 @@
1
+ from tico.experimental.quantization.ptq.wrappers.fairseq.quant_mha import (
2
+ QuantFairseqMultiheadAttention,
3
+ )
4
+
5
+ __all__ = ["QuantFairseqMultiheadAttention"]
@@ -0,0 +1,165 @@
1
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # -----------------------------------------------------------------------------
16
+ # This file includes modifications based on fairseq
17
+ # (https://github.com/facebookresearch/fairseq), originally licensed under
18
+ # the MIT License. See the LICENSE file in the fairseq repository for details.
19
+ # -----------------------------------------------------------------------------
20
+
21
+ from typing import Optional
22
+
23
+ import torch.nn as nn
24
+ from torch import Tensor
25
+
26
+ from tico.experimental.quantization.ptq.quant_config import QuantConfig
27
+ from tico.experimental.quantization.ptq.wrappers.fairseq.quant_mha import (
28
+ QuantFairseqMultiheadAttention,
29
+ )
30
+ from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
31
+ from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
32
+ QuantModuleBase,
33
+ )
34
+ from tico.experimental.quantization.ptq.wrappers.registry import try_register
35
+
36
+
37
+ @try_register("fairseq.modules.transformer_layer.TransformerEncoderLayerBase")
38
+ class QuantFairseqEncoderLayer(QuantModuleBase):
39
+ """
40
+ Quant-aware drop-in replacement for Fairseq TransformerEncoderLayerBase.
41
+
42
+ Design notes (inference-friendly):
43
+ - All training-time logic (dropout, activation-dropout) is removed.
44
+ - I/O shape follows Fairseq convention: [T, B, C].
45
+ - `return_fc` behavior is preserved (returns (x, fc_result) if enabled).
46
+ """
47
+
48
+ def __init__(
49
+ self,
50
+ fp_layer: nn.Module,
51
+ *,
52
+ qcfg: Optional[QuantConfig] = None,
53
+ fp_name: Optional[str] = None,
54
+ ):
55
+ super().__init__(qcfg, fp_name=fp_name)
56
+
57
+ # --- copy meta / config flags from FP layer (read-only) -------------
58
+ assert hasattr(fp_layer, "embed_dim")
59
+ assert hasattr(fp_layer, "normalize_before")
60
+ self.embed_dim: int = int(fp_layer.embed_dim) # type: ignore[arg-type]
61
+ self.normalize_before: bool = bool(fp_layer.normalize_before)
62
+ self.return_fc: bool = bool(getattr(fp_layer, "return_fc", False))
63
+
64
+ # --- PTQ-wrapped submodules ----------------------------------------
65
+ attn_cfg = qcfg.child("self_attn") if qcfg else None
66
+ fc1_cfg = qcfg.child("fc1") if qcfg else None
67
+ fc2_cfg = qcfg.child("fc2") if qcfg else None
68
+ attn_ln_cfg = qcfg.child("self_attn_layer_norm") if qcfg else None
69
+ final_ln_cfg = qcfg.child("final_layer_norm") if qcfg else None
70
+
71
+ assert hasattr(fp_layer, "self_attn") and isinstance(
72
+ fp_layer.self_attn, nn.Module
73
+ )
74
+ assert hasattr(fp_layer, "fc1") and isinstance(fp_layer.fc1, nn.Module)
75
+ assert hasattr(fp_layer, "fc2") and isinstance(fp_layer.fc2, nn.Module)
76
+
77
+ self.self_attn = QuantFairseqMultiheadAttention(
78
+ fp_layer.self_attn, qcfg=attn_cfg, fp_name=f"{fp_name}.self_attn"
79
+ )
80
+ self.fc1 = PTQWrapper(fp_layer.fc1, qcfg=fc1_cfg, fp_name=f"{fp_name}.fc1")
81
+ self.fc2 = PTQWrapper(fp_layer.fc2, qcfg=fc2_cfg, fp_name=f"{fp_name}.fc2")
82
+
83
+ # LayerNorms
84
+ assert hasattr(fp_layer, "self_attn_layer_norm") and isinstance(
85
+ fp_layer.self_attn_layer_norm, nn.Module
86
+ )
87
+ assert hasattr(fp_layer, "final_layer_norm") and isinstance(
88
+ fp_layer.final_layer_norm, nn.Module
89
+ )
90
+ self.self_attn_layer_norm = PTQWrapper(
91
+ fp_layer.self_attn_layer_norm,
92
+ qcfg=attn_ln_cfg,
93
+ fp_name=f"{fp_name}.self_attn_layer_norm",
94
+ )
95
+ self.final_layer_norm = PTQWrapper(
96
+ fp_layer.final_layer_norm,
97
+ qcfg=final_ln_cfg,
98
+ fp_name=f"{fp_name}.final_layer_norm",
99
+ )
100
+
101
+ # Activation function
102
+ self.activation_fn = fp_layer.activation_fn # type: ignore[operator] # e.g., GELU/ReLU
103
+ self.obs_activation_fn = self._make_obs("activation_fn")
104
+
105
+ # ----------------------------------------------------------------------
106
+ def forward(
107
+ self,
108
+ x: Tensor, # [T,B,C]
109
+ encoder_padding_mask: Optional[Tensor],
110
+ attn_mask: Optional[Tensor] = None, # [T,S] boolean/byte or additive float
111
+ ):
112
+ """
113
+ Returns:
114
+ x' of shape [T, B, C] (or (x', fc_result) when return_fc=True)
115
+ """
116
+ # ---- Self-Attention block (pre-/post-norm kept as in FP layer) ----
117
+ residual = x
118
+ if self.normalize_before:
119
+ x = self.self_attn_layer_norm(x)
120
+
121
+ # Fairseq MHA expects [T,B,C]; our wrapped module keeps the same API
122
+ attn_out, _ = self.self_attn(
123
+ query=x,
124
+ key=x,
125
+ value=x,
126
+ key_padding_mask=encoder_padding_mask, # additive float [B,S] or None
127
+ need_weights=False,
128
+ attn_mask=attn_mask, # additive float [T,S] or None
129
+ )
130
+ x = residual + attn_out
131
+
132
+ if not self.normalize_before:
133
+ x = self.self_attn_layer_norm(x)
134
+
135
+ # ---- FFN block (no dropout/activation-dropout) --------------------
136
+ residual = x
137
+ if self.normalize_before:
138
+ x = self.final_layer_norm(x)
139
+
140
+ x = self.fc1(x) # Linear
141
+ x = self.activation_fn(x) # type: ignore[operator]
142
+ x = self._fq(x, self.obs_activation_fn)
143
+ x = self.fc2(x) # Linear
144
+
145
+ fc_result = x # keep before residual for optional return
146
+
147
+ x = residual + x
148
+ if not self.normalize_before:
149
+ x = self.final_layer_norm(x)
150
+
151
+ if self.return_fc:
152
+ return x, fc_result
153
+ return x
154
+
155
+ def _all_observers(self):
156
+ yield from (self.obs_activation_fn,)
157
+ for m in (
158
+ self.self_attn,
159
+ self.fc1,
160
+ self.fc2,
161
+ self.self_attn_layer_norm,
162
+ self.final_layer_norm,
163
+ ):
164
+ if isinstance(m, QuantModuleBase):
165
+ yield from m._all_observers()
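A hedged sketch of how the new wrapper might be dropped into an existing fairseq model; it assumes fairseq (and tico) are installed, that the model follows fairseq's usual model.encoder.layers layout, and that running without a QuantConfig (qcfg=None) is acceptable, none of which this diff states explicitly:

import torch.nn as nn

from tico.experimental.quantization.ptq.wrappers.fairseq.quant_encoder_layer import (
    QuantFairseqEncoderLayer,
)


def wrap_encoder_layers(model: nn.Module) -> nn.Module:
    # nn.ModuleList of fairseq TransformerEncoderLayerBase instances
    layers = model.encoder.layers
    for i, fp_layer in enumerate(layers):
        layers[i] = QuantFairseqEncoderLayer(
            fp_layer, qcfg=None, fp_name=f"encoder.layers.{i}"
        )
    return model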
@@ -0,0 +1,383 @@
1
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # -----------------------------------------------------------------------------
16
+ # This file includes modifications based on fairseq
17
+ # (https://github.com/facebookresearch/fairseq), originally licensed under
18
+ # the MIT License. See the LICENSE file in the fairseq repository for details.
19
+ # -----------------------------------------------------------------------------
20
+
21
+ from typing import Dict, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+
27
+ from tico.experimental.quantization.ptq.quant_config import QuantConfig
28
+ from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
29
+ from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
30
+ QuantModuleBase,
31
+ )
32
+ from tico.experimental.quantization.ptq.wrappers.registry import try_register
33
+
34
+
35
+ @try_register("fairseq.modules.multihead_attention.MultiheadAttention")
36
+ class QuantFairseqMultiheadAttention(QuantModuleBase):
37
+ """
38
+ Quant-aware drop-in for Fairseq MultiheadAttention.
39
+
40
+ - No xFormers / no torch F.multi_head_attention_forward fast-path.
41
+ - Self/cross attention + minimal incremental KV cache.
42
+ - Causal mask is pre-built statically; `key_padding_mask` is additive float.
43
+ - I/O shape: [T, B, C]
44
+
45
+ Runtime optimization flags
46
+ --------------------------
47
+ use_static_causal : bool
48
+ If True, reuse a precomputed upper-triangular causal mask template
49
+ instead of rebuilding it each forward step. Reduces per-step mask
50
+ construction overhead during incremental decoding.
51
+
52
+ assume_additive_key_padding : bool
53
+ If True, assume the `key_padding_mask` is already an additive float
54
+ tensor (large negative values at padded positions). Skips conversion
55
+ from boolean masks, reducing runtime overhead.
56
+ """
57
+
58
+ def __init__(
59
+ self,
60
+ fp_attn: nn.Module,
61
+ *,
62
+ qcfg: Optional[QuantConfig] = None,
63
+ fp_name: Optional[str] = None,
64
+ max_seq: int = 4096,
65
+ use_static_causal: bool = False,
66
+ mask_fill_value: float = -120.0,
67
+ assume_additive_key_padding: bool = False,
68
+ ):
69
+ super().__init__(qcfg, fp_name=fp_name)
70
+
71
+ self.use_static_causal = use_static_causal
72
+ self.mask_fill_value = mask_fill_value
73
+ self.assume_additive_key_padding = assume_additive_key_padding
74
+ self.embed_dim: int = int(fp_attn.embed_dim) # type: ignore[arg-type]
75
+ self.num_heads: int = int(fp_attn.num_heads) # type: ignore[arg-type]
76
+ self.head_dim: int = self.embed_dim // self.num_heads
77
+ assert self.head_dim * self.num_heads == self.embed_dim
78
+
79
+ self.self_attention: bool = bool(getattr(fp_attn, "self_attention", False))
80
+ self.encoder_decoder_attention: bool = bool(
81
+ getattr(fp_attn, "encoder_decoder_attention", False)
82
+ )
83
+ assert self.self_attention != self.encoder_decoder_attention
84
+
85
+ # PTQ-wrapped projections
86
+ qc = qcfg.child("q_proj") if qcfg else None
87
+ kc = qcfg.child("k_proj") if qcfg else None
88
+ vc = qcfg.child("v_proj") if qcfg else None
89
+ oc = qcfg.child("out_proj") if qcfg else None
90
+ assert hasattr(fp_attn, "q_proj") and hasattr(fp_attn, "k_proj")
91
+ assert hasattr(fp_attn, "v_proj") and hasattr(fp_attn, "out_proj")
92
+ assert isinstance(fp_attn.q_proj, nn.Module) and isinstance(
93
+ fp_attn.k_proj, nn.Module
94
+ )
95
+ assert isinstance(fp_attn.v_proj, nn.Module) and isinstance(
96
+ fp_attn.out_proj, nn.Module
97
+ )
98
+ self.q_proj = PTQWrapper(fp_attn.q_proj, qcfg=qc, fp_name=f"{fp_name}.q_proj")
99
+ self.k_proj = PTQWrapper(fp_attn.k_proj, qcfg=kc, fp_name=f"{fp_name}.k_proj")
100
+ self.v_proj = PTQWrapper(fp_attn.v_proj, qcfg=vc, fp_name=f"{fp_name}.v_proj")
101
+ self.out_proj = PTQWrapper(
102
+ fp_attn.out_proj, qcfg=oc, fp_name=f"{fp_name}.out_proj"
103
+ )
104
+
105
+ # scale & static causal mask
106
+ self.register_buffer(
107
+ "scale_const", torch.tensor(self.head_dim**-0.5), persistent=False
108
+ )
109
+ mask = torch.full((1, 1, max_seq, max_seq), float(self.mask_fill_value))
110
+ mask.triu_(1)
111
+ self.register_buffer("causal_mask_template", mask, persistent=False)
112
+
113
+ # observers (no *_proj_out here; PTQWrapper handles module outputs)
114
+ mk = self._make_obs
115
+ self.obs_query_in = mk("query_in")
116
+ self.obs_key_in = mk("key_in")
117
+ self.obs_value_in = mk("value_in")
118
+ self.obs_kpm_in = mk("kpm_in")
119
+ self.obs_causal_mask = mk("causal_mask")
120
+ self.obs_q_fold = mk("q_fold")
121
+ self.obs_k_fold = mk("k_fold")
122
+ self.obs_v_fold = mk("v_fold")
123
+ self.obs_scale = mk("scale")
124
+ self.obs_logits_raw = mk("logits_raw")
125
+ self.obs_logits = mk("logits_scaled")
126
+ self.obs_attn_mask_add = mk("obs_attn_mask_add")
127
+ self.obs_kp_mask_add = mk("obs_kp_mask_add")
128
+ self.obs_softmax = mk("softmax")
129
+ self.obs_attn_out = mk("attn_out")
130
+
131
+ safe_name = (
132
+ fp_name if (fp_name not in (None, "", "None")) else f"QuantFsMHA_{id(self)}"
133
+ )
134
+ assert safe_name is not None
135
+ self._state_key = safe_name + ".attn_state"
136
+
137
+ def _get_input_buffer(
138
+ self,
139
+ incremental_state: Optional[Dict[str, Dict[str, Optional[torch.Tensor]]]],
140
+ ) -> Optional[Dict[str, Optional[torch.Tensor]]]:
141
+ """Return saved KV/mask dict or None."""
142
+ if incremental_state is None:
143
+ return None
144
+ return incremental_state.get(self._state_key, None)
145
+
146
+ def _set_input_buffer(
147
+ self,
148
+ incremental_state: Optional[Dict[str, Dict[str, Optional[torch.Tensor]]]],
149
+ buffer: Dict[str, Optional[torch.Tensor]],
150
+ ):
151
+ """Store KV/mask dict in incremental_state."""
152
+ if incremental_state is not None:
153
+ incremental_state[self._state_key] = buffer
154
+ return incremental_state
155
+
156
+ # ---- utils ----
157
+ def _fold_heads(self, x: torch.Tensor, B: int) -> torch.Tensor:
158
+ # [T,B,E] -> [B*H, T, Dh]
159
+ T = x.size(0)
160
+ x = x.view(T, B, self.num_heads, self.head_dim).permute(1, 2, 0, 3).contiguous()
161
+ return x.view(B * self.num_heads, T, self.head_dim)
162
+
163
+ def _unfold_heads(self, x: torch.Tensor, B: int, T: int) -> torch.Tensor:
164
+ # [B*H, T, Dh] -> [T,B,E]
165
+ x = x.view(B, self.num_heads, T, self.head_dim).permute(2, 0, 1, 3).contiguous()
166
+ return x.view(T, B, self.embed_dim)
167
+
168
+ def forward(
169
+ self,
170
+ query: torch.Tensor, # [Tq,B,C]
171
+ key: Optional[torch.Tensor],
172
+ value: Optional[torch.Tensor],
173
+ key_padding_mask: Optional[
174
+ torch.Tensor
175
+ ] = None, # additive float (e.g. -120 at pads)
176
+ incremental_state: Optional[
177
+ Dict[str, Dict[str, Optional[torch.Tensor]]]
178
+ ] = None,
179
+ need_weights: bool = False,
180
+ static_kv: bool = False,
181
+ attn_mask: Optional[torch.Tensor] = None, # if None -> internal causal
182
+ before_softmax: bool = False,
183
+ need_head_weights: bool = False,
184
+ return_new_kv: bool = False,
185
+ ) -> Union[
186
+ Tuple[torch.Tensor, Optional[torch.Tensor]],
187
+ Tuple[
188
+ torch.Tensor,
189
+ Optional[torch.Tensor],
190
+ Optional[torch.Tensor],
191
+ Optional[torch.Tensor],
192
+ ],
193
+ ]:
194
+
195
+ if need_head_weights:
196
+ need_weights = True
197
+
198
+ Tq, B, _ = query.shape
199
+ if self.self_attention:
200
+ key = query if key is None else key
201
+ value = query if value is None else value
202
+ else:
203
+ assert key is not None and value is not None
204
+
205
+ Tk, Bk, _ = key.shape
206
+ Tv, Bv, _ = value.shape
207
+ assert B == Bk == Bv
208
+
209
+ q = self.q_proj(self._fq(query, self.obs_query_in))
210
+ k = self.k_proj(self._fq(key, self.obs_key_in))
211
+ v = self.v_proj(self._fq(value, self.obs_value_in))
212
+
213
+ state = self._get_input_buffer(incremental_state)
214
+ if incremental_state is not None and state is None:
215
+ state = {}
216
+
217
+ # Capture "new" K/V for this call BEFORE concatenating with cache
218
+ new_k_bh: Optional[torch.Tensor] = None
219
+ new_v_bh: Optional[torch.Tensor] = None
220
+
221
+ # Fold heads
222
+ q = self._fq(self._fold_heads(q, B), self.obs_q_fold)
223
+ if state is not None and "prev_key" in state and static_kv:
224
+ # Cross-attention static_kv path: reuse cached KV; there is no new KV this call.
225
+ k = None
226
+ v = None
227
+ if k is not None:
228
+ k = self._fq(self._fold_heads(k, B), self.obs_k_fold) # [B*H, Tnew, Dh]
229
+ if return_new_kv:
230
+ new_k_bh = k.contiguous()
231
+ if v is not None:
232
+ v = self._fq(self._fold_heads(v, B), self.obs_v_fold) # [B*H, Tnew, Dh]
233
+ if return_new_kv:
234
+ new_v_bh = v.contiguous()
235
+
236
+ # Append/reuse cache
237
+ if state is not None:
238
+ pk = state.get("prev_key")
239
+ pv = state.get("prev_value")
240
+ if pk is not None:
241
+ pk = pk.view(B * self.num_heads, -1, self.head_dim)
242
+ k = pk if static_kv else torch.cat([pk, k], dim=1)
243
+ if pv is not None:
244
+ pv = pv.view(B * self.num_heads, -1, self.head_dim)
245
+ v = pv if static_kv else torch.cat([pv, v], dim=1)
246
+
247
+ assert k is not None and v is not None
248
+ Ts = k.size(1)
249
+
250
+ # Scaled dot-product
251
+ scale = self._fq(self.scale_const, self.obs_scale).to(q.dtype)
252
+ logits_raw = self._fq(
253
+ torch.bmm(q, k.transpose(1, 2)), self.obs_logits_raw
254
+ ) # [B*H,Tq,Ts]
255
+ logits = self._fq(logits_raw * scale, self.obs_logits)
256
+
257
+ assert isinstance(self.causal_mask_template, torch.Tensor)
258
+ # Masks
259
+ device = logits.device
260
+ if attn_mask is None and self.use_static_causal:
261
+ # Incremental decoding aware slicing:
262
+ # align the causal row(s) to the current time indices
263
+ start_q = max(Ts - Tq, 0)
264
+ cm = self.causal_mask_template[..., start_q : start_q + Tq, :Ts].to(
265
+ device=device, dtype=logits.dtype
266
+ )
267
+ attn_mask = cm.squeeze(0).squeeze(0) # [Tq,Ts]
268
+
269
+ if attn_mask is not None:
270
+ # Bool/byte mask -> additive float with large negatives
271
+ if not torch.is_floating_point(attn_mask):
272
+ fill = self.causal_mask_template.new_tensor(self.mask_fill_value)
273
+ attn_mask = torch.where(
274
+ attn_mask.to(torch.bool), fill, fill.new_zeros(())
275
+ )
276
+ attn_mask = self._fq(attn_mask, self.obs_causal_mask)
277
+ assert isinstance(attn_mask, torch.Tensor)
278
+
279
+ if not self.assume_additive_key_padding:
280
+ # attn_mask -> [B*H,Tq,Ts]
281
+ if attn_mask.dim() == 2:
282
+ add_mask = attn_mask.unsqueeze(0).expand(logits.size(0), -1, -1)
283
+ elif attn_mask.dim() == 3:
284
+ add_mask = (
285
+ attn_mask.unsqueeze(1)
286
+ .expand(B, self.num_heads, Tq, Ts)
287
+ .contiguous()
288
+ )
289
+ add_mask = add_mask.view(B * self.num_heads, Tq, Ts)
290
+ else:
291
+ raise RuntimeError("attn_mask must be [T,S] or [B,T,S]")
292
+ else:
293
+ add_mask = attn_mask
294
+ logits = self._fq(logits + add_mask, self.obs_attn_mask_add)
295
+
296
+ if key_padding_mask is not None:
297
+ if not torch.is_floating_point(key_padding_mask):
298
+ fill = self.causal_mask_template.new_tensor(self.mask_fill_value)
299
+ kpm = torch.where(
300
+ key_padding_mask.to(torch.bool), fill, fill.new_zeros(())
301
+ )
302
+ else:
303
+ kpm = key_padding_mask
304
+ kpm = self._fq(kpm, self.obs_kpm_in)
305
+
306
+ if not self.assume_additive_key_padding:
307
+ # key_padding_mask: additive float already
308
+ kpm = kpm.to(dtype=logits.dtype, device=device)
309
+ if kpm.dim() == 2: # [B,S]
310
+ kpm = (
311
+ kpm.view(B, 1, 1, Ts)
312
+ .expand(B, self.num_heads, Tq, Ts)
313
+ .contiguous()
314
+ )
315
+ kpm = kpm.view(B * self.num_heads, Tq, Ts)
316
+ elif kpm.dim() == 3: # [B,T,S]
317
+ kpm = (
318
+ kpm.unsqueeze(1).expand(B, self.num_heads, Tq, Ts).contiguous()
319
+ )
320
+ kpm = kpm.view(B * self.num_heads, Tq, Ts)
321
+ else:
322
+ raise RuntimeError(
323
+ "key_padding_mask must be [B,S] or [B,T,S] (additive)"
324
+ )
325
+ logits = self._fq(logits + kpm, self.obs_kp_mask_add)
326
+
327
+ if before_softmax:
328
+ if return_new_kv:
329
+ return logits, v, new_k_bh, new_v_bh
330
+ return logits, v
331
+
332
+ # Softmax (float32) -> back to q.dtype
333
+ attn_probs = torch.softmax(logits, dim=-1, dtype=torch.float32).to(q.dtype)
334
+ attn_probs = self._fq(attn_probs, self.obs_softmax)
335
+
336
+ # Context + output proj
337
+ ctx = self._fq(torch.bmm(attn_probs, v), self.obs_attn_out) # [B*H,Tq,Dh]
338
+ ctx = self._unfold_heads(ctx, B, Tq) # [Tq,B,E]
339
+ out = self.out_proj(ctx)
340
+
341
+ # Weights (optional)
342
+ attn_weights_out: Optional[torch.Tensor] = None
343
+ if need_weights:
344
+ aw = (
345
+ torch.softmax(logits, dim=-1, dtype=torch.float32)
346
+ .view(B, self.num_heads, Tq, Ts)
347
+ .transpose(1, 0)
348
+ )
349
+ if not need_head_weights:
350
+ aw = aw.mean(dim=1) # [B,Tq,Ts]
351
+ attn_weights_out = aw
352
+
353
+ # Cache write
354
+ if state is not None:
355
+ state["prev_key"] = k.view(B, self.num_heads, -1, self.head_dim).detach()
356
+ state["prev_value"] = v.view(B, self.num_heads, -1, self.head_dim).detach()
357
+ self._set_input_buffer(incremental_state, state)
358
+
359
+ if return_new_kv:
360
+ return out, attn_weights_out, new_k_bh, new_v_bh
361
+ return out, attn_weights_out
362
+
363
+ def _all_observers(self):
364
+ yield from (
365
+ self.obs_query_in,
366
+ self.obs_key_in,
367
+ self.obs_value_in,
368
+ self.obs_kpm_in,
369
+ self.obs_causal_mask,
370
+ self.obs_q_fold,
371
+ self.obs_k_fold,
372
+ self.obs_v_fold,
373
+ self.obs_scale,
374
+ self.obs_logits_raw,
375
+ self.obs_logits,
376
+ self.obs_attn_mask_add,
377
+ self.obs_kp_mask_add,
378
+ self.obs_softmax,
379
+ self.obs_attn_out,
380
+ )
381
+ for m in (self.q_proj, self.k_proj, self.v_proj, self.out_proj):
382
+ if isinstance(m, QuantModuleBase):
383
+ yield from m._all_observers()
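The _fold_heads/_unfold_heads pair above is a pure layout change between [T, B, E] and [B*H, T, Dh]; a standalone sketch of that roundtrip (shapes chosen arbitrarily) may make the reshapes easier to follow:

import torch

T, B, H, Dh = 5, 2, 4, 8                # time, batch, heads, head dim
E = H * Dh                              # embed dim

x = torch.randn(T, B, E)

# fold: [T, B, E] -> [B*H, T, Dh]
folded = x.view(T, B, H, Dh).permute(1, 2, 0, 3).contiguous().view(B * H, T, Dh)

# unfold: [B*H, T, Dh] -> [T, B, E]
restored = folded.view(B, H, T, Dh).permute(2, 0, 1, 3).contiguous().view(T, B, E)

print(torch.equal(x, restored))         # True: folding is lossless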
@@ -28,9 +28,13 @@ _CORE_MODULES = (
28
28
  "tico.experimental.quantization.ptq.wrappers.nn.quant_layernorm",
29
29
  "tico.experimental.quantization.ptq.wrappers.nn.quant_linear",
30
30
  "tico.experimental.quantization.ptq.wrappers.nn.quant_silu",
31
+ # llama
31
32
  "tico.experimental.quantization.ptq.wrappers.llama.quant_attn",
32
33
  "tico.experimental.quantization.ptq.wrappers.llama.quant_decoder_layer",
33
34
  "tico.experimental.quantization.ptq.wrappers.llama.quant_mlp",
35
+ # fairseq
36
+ "tico.experimental.quantization.ptq.wrappers.fairseq.quant_encoder_layer",
37
+ "tico.experimental.quantization.ptq.wrappers.fairseq.quant_mha",
34
38
  # add future core wrappers here
35
39
  )
36
40
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: tico
3
- Version: 0.1.0.dev250907
3
+ Version: 0.1.0.dev250909
4
4
  Summary: Convert exported Torch module to circle
5
5
  Home-page: UNKNOWN
6
6
  License: UNKNOWN
@@ -1,4 +1,4 @@
1
- tico/__init__.py,sha256=MfAz3yLfVqU9e5bpBfh5Go7pzCFn18RkWuKz6GkfIqo,1883
1
+ tico/__init__.py,sha256=c5spmq5DrUrTLuWPal98sdmfFYPjuRym0xcEnK9Am_U,1883
2
2
  tico/pt2_to_circle.py,sha256=gu3MD4Iqc0zMZcCZ2IT8oGbyj21CTSbT3Rgd9s2B_9A,2767
3
3
  tico/config/__init__.py,sha256=xZzCXjZ84qE-CsBi-dfaL05bqpQ3stKKfTXhnrJRyVs,142
4
4
  tico/config/base.py,sha256=q5xMqGxTUZs4mFqt5c7i_y9U00fYgdMGl9nUqIVMlCo,1248
@@ -6,7 +6,7 @@ tico/config/factory.py,sha256=il0zqB6Lm5NX2LnG-TUhmiP9vVeZ_3TucJMorVZIodY,1324
6
6
  tico/config/v1.py,sha256=O1jzpUBDwoWpLohEpI08pJNwVB-yz3ufPrQm2_XWq4Y,1108
7
7
  tico/experimental/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
8
8
  tico/experimental/quantization/__init__.py,sha256=IaJPZegVJp0P3luutBo907Kp5sOJensE1Mm-XBG_jBs,122
9
- tico/experimental/quantization/config.py,sha256=1bCSAUI043Kbq08j59mb-K1cP2lmBMbekh8p3hNK6b8,1675
9
+ tico/experimental/quantization/config.py,sha256=nMepa_H471t7f3bKMvR8cZUZgruy_8kdb147rBkTWCQ,2004
10
10
  tico/experimental/quantization/public_interface.py,sha256=4-v9VXsokRG2-UUYYHd_MlbHxChqdGI5iuySyYDY_Pw,4420
11
11
  tico/experimental/quantization/quantizer.py,sha256=_2pDtWFKDCuKfYF2bptOwIYsa0VFNFM1ZNgi8_OGvHM,2365
12
12
  tico/experimental/quantization/algorithm/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
@@ -37,9 +37,9 @@ tico/experimental/quantization/algorithm/pt2e/annotation/op/sub.py,sha256=4z8HoY
37
37
  tico/experimental/quantization/algorithm/pt2e/transformation/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
38
38
  tico/experimental/quantization/algorithm/pt2e/transformation/convert_scalars_to_attrs.py,sha256=Idtoya2RcGKlgUJgC9WqNz0jH3gf6ViuPmsD9ySHbls,2253
39
39
  tico/experimental/quantization/algorithm/smoothquant/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
40
- tico/experimental/quantization/algorithm/smoothquant/observer.py,sha256=yi9nR_BEuxKVjgFcYPeldhXlEbE9V-0r4kRRPcI3C70,2639
41
- tico/experimental/quantization/algorithm/smoothquant/quantizer.py,sha256=rQMtnqM1dBzjhY-KJRp3TWZSW-dzXrs5N5FagXzh0IQ,2564
42
- tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py,sha256=O1h7IojcsJaprFvRM9tsAuR3q7vTjKKRi88jD-nH79Y,6175
40
+ tico/experimental/quantization/algorithm/smoothquant/observer.py,sha256=OWBKQ3ox6PqeqgevxOjpXvb7uApoqE4YbUBelGhVSN8,3435
41
+ tico/experimental/quantization/algorithm/smoothquant/quantizer.py,sha256=QuZBi24L-LYI26nwZd6JmTdokxr6-l_vIgZvWVdqx_o,3637
42
+ tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py,sha256=fxCy4m-BsSjraciSVPFlPhgsOT46RjrOgczQGb7B9TA,11561
43
43
  tico/experimental/quantization/evaluation/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
44
44
  tico/experimental/quantization/evaluation/backend.py,sha256=CZL9rZOA0t8cH7PHp6u9l7dGqWNvTj9bKOvwo0PVul0,692
45
45
  tico/experimental/quantization/evaluation/evaluate.py,sha256=kfa_GvFaX6DoSTAmuCImMJqF2jgqtnor5UpC7wVmGPI,7877
@@ -84,7 +84,10 @@ tico/experimental/quantization/ptq/wrappers/__init__.py,sha256=IO6FP_xYbGy0dW0HL
84
84
  tico/experimental/quantization/ptq/wrappers/ptq_wrapper.py,sha256=F9sK_DiRaXiGNHULcwIbs5EUtHz6ZJ7N4r5CWTTfhsM,2442
85
85
  tico/experimental/quantization/ptq/wrappers/quant_elementwise.py,sha256=LhEoobfvto6zKrBOKL4gmxfFFc31jHzyQV_zfps-iQM,3604
86
86
  tico/experimental/quantization/ptq/wrappers/quant_module_base.py,sha256=vkcDos_knGSS29rIZuEIWkAJLHrENbGz8nCH2-iara8,5969
87
- tico/experimental/quantization/ptq/wrappers/registry.py,sha256=wauoZdZBR15bGj1Upt9owEfFDT-Tj6HzciG9HDM1BHo,4845
87
+ tico/experimental/quantization/ptq/wrappers/registry.py,sha256=bTd1fZGCXkL4iaduKUXjWVpRXfvOaJGeurxwKJBVu6I,5019
88
+ tico/experimental/quantization/ptq/wrappers/fairseq/__init__.py,sha256=Mc8FLd9DusyB_IT1vk1OYrRkngOYnYd05IvtA9ORVQc,160
89
+ tico/experimental/quantization/ptq/wrappers/fairseq/quant_encoder_layer.py,sha256=aGr80Ku75j2H-UZ0elEa0mOQEyaAs2YJ4WJCN0lonn0,6412
90
+ tico/experimental/quantization/ptq/wrappers/fairseq/quant_mha.py,sha256=HsigmOLeacLXc46QNeFqwQ0DwKQhNrtWTKEtLJoqXoc,15562
88
91
  tico/experimental/quantization/ptq/wrappers/llama/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
89
92
  tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py,sha256=-K1COLHIHfJZhQu-RE6KfJIkaL7S6yR4iUj48QkjMTw,8652
90
93
  tico/experimental/quantization/ptq/wrappers/llama/quant_decoder_layer.py,sha256=2XsIf5rcabDXXkahqriSxfo2curFq0Y5bnRPcYkJPg8,7187
@@ -247,9 +250,9 @@ tico/utils/mx/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
247
250
  tico/utils/mx/elemwise_ops.py,sha256=V6glyAHsVR1joqpsgnNytatCD_ew92xNWZ19UFDoMTA,10281
248
251
  tico/utils/mx/formats.py,sha256=uzNWyu-1onUlwQfX5cZ6fZSUfHMRqorper7_T1k3jfk,3404
249
252
  tico/utils/mx/mx_ops.py,sha256=RcfUTYVi-wilGB2sC35OeARdwDqnixv7dG5iyZ-fQT8,8555
250
- tico-0.1.0.dev250907.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
251
- tico-0.1.0.dev250907.dist-info/METADATA,sha256=XGjEP9uNW7vXB1SKhhrraD27QYxb1QaxJZuRWFtYlT0,8450
252
- tico-0.1.0.dev250907.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
253
- tico-0.1.0.dev250907.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
254
- tico-0.1.0.dev250907.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
255
- tico-0.1.0.dev250907.dist-info/RECORD,,
253
+ tico-0.1.0.dev250909.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
254
+ tico-0.1.0.dev250909.dist-info/METADATA,sha256=udUCPXehe7bxZ27PQKpcxub4Q6VSoaoEmckvpdx5FPo,8450
255
+ tico-0.1.0.dev250909.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
256
+ tico-0.1.0.dev250909.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
257
+ tico-0.1.0.dev250909.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
258
+ tico-0.1.0.dev250909.dist-info/RECORD,,