tico 0.1.0.dev250904__py3-none-any.whl → 0.1.0.dev250908__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tico/__init__.py +1 -1
- tico/experimental/quantization/algorithm/smoothquant/observer.py +26 -8
- tico/experimental/quantization/algorithm/smoothquant/quantizer.py +22 -1
- tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py +188 -25
- tico/experimental/quantization/config.py +7 -1
- tico/experimental/quantization/ptq/wrappers/fairseq/__init__.py +5 -0
- tico/experimental/quantization/ptq/wrappers/fairseq/quant_mha.py +383 -0
- tico/experimental/quantization/ptq/wrappers/registry.py +3 -0
- {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/METADATA +1 -1
- {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/RECORD +14 -12
- {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/LICENSE +0 -0
- {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/WHEEL +0 -0
- {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/entry_points.txt +0 -0
- {tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/top_level.txt +0 -0
tico/__init__.py CHANGED
tico/experimental/quantization/algorithm/smoothquant/observer.py CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import functools
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Literal
 
 import torch
 
@@ -21,18 +21,24 @@ import torch
 class ChannelwiseMaxActsObserver:
     """
     Observer to calcuate channelwise maximum activation
+    It supports collecting activations from either module inputs or outputs.
     """
 
-    def __init__(self, model: torch.nn.Module):
+    def __init__(
+        self, model: torch.nn.Module, acts_from: Literal["input", "output"] = "input"
+    ):
         """
         model
             A torch module whose activations are to be analyzed.
+        acts_from
+            Where to hook: "input" for forward-pre-hook, "output" for forward-hook.
         hooks
-            A list to store the hooks
+            A list to store the hooks registered to collect activation statistics.
         max_acts
-            A dictionary to store the
+            A dictionary to store the per-channel maxima.
         """
         self.model = model
+        self.acts_from: Literal["input", "output"] = acts_from
         self.hooks: List[Any] = []
         self.max_acts: Dict[str, torch.Tensor] = {}
 
@@ -62,13 +68,25 @@ class ChannelwiseMaxActsObserver:
             input = input[0]
             stat_tensor(name, input)
 
+        def stat_output_hook(m, input, output, name):
+            if isinstance(output, tuple):
+                output = output[0]
+            stat_tensor(name, output)
+
         for name, m in self.model.named_modules():
             if isinstance(m, torch.nn.Linear):
-                self.hooks.append(
-                    m.register_forward_pre_hook(
-                        functools.partial(stat_input_hook, name=name)
+                if self.acts_from == "input":
+                    self.hooks.append(
+                        m.register_forward_pre_hook(
+                            functools.partial(stat_input_hook, name=name)
+                        )
+                    )
+                else:  # "output"
+                    self.hooks.append(
+                        m.register_forward_hook(
+                            functools.partial(stat_output_hook, name=name)
+                        )
                     )
-                )
 
     def remove(self):
         for hook in self.hooks:
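The observer above only hooks `torch.nn.Linear` modules, so it can be driven independently of any particular architecture. A minimal sketch, assuming `attach()`, `remove()` and `max_acts` behave as the diff suggests; the toy `Sequential` model and the random calibration batches are illustrative stand-ins:

```python
import torch

from tico.experimental.quantization.algorithm.smoothquant.observer import (
    ChannelwiseMaxActsObserver,
)

# Illustrative stand-in for a real network to be calibrated.
model = torch.nn.Sequential(
    torch.nn.Linear(16, 32), torch.nn.ReLU(), torch.nn.Linear(32, 8)
)

# Collect per-channel maxima from Linear *outputs* (forward-hooks) instead of
# the default inputs (forward-pre-hooks).
observer = ChannelwiseMaxActsObserver(model, acts_from="output")
observer.attach()

with torch.no_grad():
    for _ in range(4):  # stand-in calibration batches
        model(torch.randn(2, 16))

observer.remove()
for name, amax in observer.max_acts.items():
    # One max vector per observed Linear, keyed by module name.
    print(name, tuple(amax.shape))
```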
tico/experimental/quantization/algorithm/smoothquant/quantizer.py CHANGED
@@ -30,6 +30,25 @@ from tico.experimental.quantization.quantizer import BaseQuantizer
 class SmoothQuantQuantizer(BaseQuantizer):
     """
     Quantizer for applying the SmoothQuant algorithm
+
+    Q) Why allow choosing between input and output activations?
+
+    SmoothQuant relies on channel-wise activation statistics to balance
+    weights and activations. In practice, there are two natural sources:
+
+    - "input": captures the tensor right before a Linear layer
+      (forward-pre-hook). This matches the original SmoothQuant paper
+      and focuses on scaling the raw hidden state.
+
+    - "output": captures the tensor right after a Linear layer
+      (forward-hook). This can better reflect post-weight dynamics,
+      especially when subsequent operations (bias, activation functions)
+      dominate the dynamic range.
+
+    Allowing both options provides flexibility: depending on model
+    architecture and calibration data, one may yield lower error than
+    the other. The default remains "input" for compatibility, but "output"
+    can be selected to empirically reduce error or runtime overhead.
     """
 
     def __init__(self, config: SmoothQuantConfig):
@@ -37,6 +56,7 @@ class SmoothQuantQuantizer(BaseQuantizer):
 
         self.alpha = config.alpha
         self.custom_alpha_map = config.custom_alpha_map
+        self.acts_from = config.acts_from  # "input" (default) or "output"
         self.observer: Optional[ChannelwiseMaxActsObserver] = None
 
     @torch.no_grad()
@@ -55,7 +75,8 @@ class SmoothQuantQuantizer(BaseQuantizer):
         Returns:
             The model prepared for SmoothQuant quantization.
         """
-        self.observer = ChannelwiseMaxActsObserver(model)
+        # Attach hooks according to `config.acts_from`
+        self.observer = ChannelwiseMaxActsObserver(model, acts_from=self.acts_from)
         self.observer.attach()
 
         return model
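Putting the config and quantizer together, a hedged usage sketch: only the constructor signatures are taken from the diffs in this release, the layer name in `custom_alpha_map` is made up, and the preparation call is left as a comment because its exact signature comes from `BaseQuantizer`, which is not shown here.

```python
from tico.experimental.quantization.algorithm.smoothquant.quantizer import (
    SmoothQuantQuantizer,
)
from tico.experimental.quantization.config import SmoothQuantConfig

# Default: statistics from Linear inputs (forward-pre-hooks), matching the
# original SmoothQuant formulation.
cfg_input = SmoothQuantConfig(alpha=0.5)

# Alternative: statistics from Linear outputs (forward-hooks), plus a
# per-layer alpha override for one hypothetical module name.
cfg_output = SmoothQuantConfig(
    alpha=0.5,
    custom_alpha_map={"model.layers.0.self_attn.q_proj": 0.75},  # illustrative name
    acts_from="output",
)

quantizer = SmoothQuantQuantizer(cfg_output)
# quantizer.prepare(model) would then construct the observer with
# acts_from="output" and attach the corresponding hooks (see the hunk above).
```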
tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py CHANGED
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, List, Optional
+from typing import Callable, Dict, List, Optional
 
 import torch
 
@@ -110,6 +110,185 @@ def smooth_weights(
         back_m.weight.mul_(scales.view(1, -1))  # type: ignore[operator]
 
 
+# TODO Split the files per model
+# ────────────────────────────────────────────────────────────
+# fairseq ReLU bridge (input-hook stats) helpers
+# ────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _compute_s_for_linear(
+    linear_like: torch.nn.Module,  # 2D weight [out, in]
+    activation_max: torch.Tensor,  # shape [in]
+    alpha: float,
+) -> torch.Tensor:
+    """
+    s = (amax^alpha / w_col_max^(1-alpha))
+    - amax: channel-wise max of the input to this module
+    - w_col_max: max(|W|) per input column
+    """
+    if not hasattr(linear_like, "weight"):
+        raise RuntimeError(f"{type(linear_like).__name__} has no 'weight' attribute.")
+    W = linear_like.weight  # [out, in]
+    assert isinstance(W, torch.Tensor)
+    if W.ndim != 2:
+        raise RuntimeError(
+            f"Expected 2D weight, got {W.ndim}D for {type(linear_like).__name__}"
+        )
+
+    device, dtype = W.device, W.dtype
+    amax = activation_max.to(device=device, dtype=dtype)
+
+    if amax.numel() != W.shape[1]:
+        raise ValueError(
+            f"activation_max numel({amax.numel()}) != in_features({W.shape[1]})"
+        )
+
+    w_col_max = W.abs().max(dim=0)[0].clamp(min=1e-5)  # [in]
+    s = (amax.pow(alpha) / w_col_max.pow(1.0 - alpha)).clamp(min=1e-5)  # [in]
+    return s
+
+
+@torch.no_grad()
+def _fuse_relu_bridge_no_runtime_mul(
+    fc1: torch.nn.Module,
+    fc2: torch.nn.Module,
+    s_hidden: torch.Tensor,
+):
+    """
+    Fuse scaling across fc1 → ReLU → fc2 without runtime multiplies:
+    - fc1 rows *= 1/s, (fc1.bias *= 1/s)
+    - fc2 cols *= s
+    Assumes middle activation is ReLU (positive homogeneous).
+    """
+    if not hasattr(fc1, "weight") or not hasattr(fc2, "weight"):
+        raise RuntimeError("fc1/fc2 must have 'weight' attributes.")
+
+    W1, W2 = fc1.weight, fc2.weight
+    assert isinstance(W1, torch.Tensor) and isinstance(W2, torch.Tensor)
+    if W1.ndim != 2 or W2.ndim != 2:
+        raise RuntimeError("fc1/fc2 weights must be 2D.")
+
+    hidden = W1.shape[0]
+    if W2.shape[1] != hidden or s_hidden.numel() != hidden:
+        raise ValueError(
+            f"Dimension mismatch: hidden={hidden}, W2.in={W2.shape[1]}, s={s_hidden.numel()}"
+        )
+
+    s = s_hidden.to(device=W1.device, dtype=W1.dtype).clamp(min=1e-5)  # [hidden]
+    inv_s = (1.0 / s).clamp(min=1e-5)
+
+    # fc1: row-wise scale
+    W1.mul_(inv_s.view(-1, 1))
+    if hasattr(fc1, "bias") and getattr(fc1, "bias") is not None:
+        assert isinstance(fc1.bias, torch.Tensor)
+        fc1.bias.mul_(inv_s)
+
+    # fc2: column-wise scale
+    W2.mul_(s.view(1, -1))
+
+
+# ────────────────────────────────────────────────────────────
+# Per-layer appliers (uniform protocol): return True if applied, else False
+# ────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _apply_if_llama_decoder(
+    name: str,
+    module: torch.nn.Module,
+    activation_max: Dict[str, torch.Tensor],
+    alpha_to_apply: float,
+) -> bool:
+    """
+    Apply LLaMA decoder-layer smoothing (input-hook stats).
+    Returns True if this handler applied smoothing to `module`.
+    """
+    try:
+        from transformers.models.llama.modeling_llama import (  # type: ignore
+            LlamaDecoderLayer,
+        )
+    except Exception:
+        return False
+
+    if not isinstance(module, LlamaDecoderLayer):
+        return False
+
+    attn_ln = module.input_layernorm
+    qkv = [
+        module.self_attn.q_proj,
+        module.self_attn.k_proj,
+        module.self_attn.v_proj,
+    ]
+    # Input-hook stats for q_proj input
+    qkv_input_scales = activation_max[name + ".self_attn.q_proj"]
+    smooth_weights(attn_ln, qkv, qkv_input_scales, alpha_to_apply)
+
+    ffn_ln = module.post_attention_layernorm
+    fcs = [module.mlp.gate_proj, module.mlp.up_proj]
+    # Input-hook stats for gate_proj input
+    fcs_input_scales = activation_max[name + ".mlp.gate_proj"]
+    smooth_weights(ffn_ln, fcs, fcs_input_scales, alpha_to_apply)
+
+    return True
+
+
+@torch.no_grad()
+def _apply_if_fairseq_relu_bridge(
+    name: str,
+    module: torch.nn.Module,
+    activation_max: Dict[str, torch.Tensor],
+    alpha_to_apply: float,
+) -> bool:
+    """
+    Apply fairseq Transformer (Encoder/Decoder) ReLU-FFN bridge fusion
+    using input-hook stats at '{name}.fc1'. Returns True if applied.
+    """
+    try:
+        from fairseq.modules.transformer_layer import (
+            TransformerDecoderLayerBase,
+            TransformerEncoderLayerBase,
+        )  # type: ignore
+    except Exception:
+        return False
+
+    if not isinstance(
+        module, (TransformerEncoderLayerBase, TransformerDecoderLayerBase)
+    ):
+        return False
+
+    # Only when FFN activation is ReLU (positive homogeneity)
+    act_fn = getattr(module, "activation_fn", None)
+    is_relu = (act_fn is torch.nn.functional.relu) or getattr(
+        act_fn, "__name__", ""
+    ) == "relu"
+    if not is_relu:
+        return False
+
+    fc1_key = f"{name}.fc1"
+    amax2 = activation_max.get(fc1_key)
+    if amax2 is None:
+        return False
+
+    fc1 = getattr(module, "fc1", None)
+    fc2 = getattr(module, "fc2", None)
+    if fc1 is None or fc2 is None or not hasattr(fc2, "weight") or fc2.weight.ndim != 2:
+        return False
+
+    s_hidden = _compute_s_for_linear(fc2, amax2, alpha_to_apply)  # [hidden]
+    _fuse_relu_bridge_no_runtime_mul(fc1, fc2, s_hidden)
+    return True
+
+
+# Registry of appliers (order matters: try LLaMA first, then fairseq)
+_APPLIERS: List[
+    Callable[[str, torch.nn.Module, Dict[str, torch.Tensor], float], bool]
+] = [
+    _apply_if_llama_decoder,
+    _apply_if_fairseq_relu_bridge,
+]
+
+
 @torch.no_grad()
 def apply_smoothing(
     model: torch.nn.Module,
@@ -133,32 +312,16 @@ def apply_smoothing(
         Layers specified in this dictionary will use the corresponding alpha
         value instead of the default.
     """
-    from transformers.models.llama.modeling_llama import LlamaDecoderLayer
-
     for name, module in model.named_modules():
-        alpha_to_apply =
-
-
+        alpha_to_apply = (
+            custom_alpha_map.get(name, alpha) if custom_alpha_map else alpha
+        )
         if alpha_to_apply > 1.0:
             raise RuntimeError(
                 f"Alpha value cannot exceed 1.0. Given alpha: {alpha_to_apply}"
            )
-
-        #
-
-
-
-        qkv = [
-            module.self_attn.q_proj,
-            module.self_attn.k_proj,
-            module.self_attn.v_proj,
-        ]
-
-        qkv_input_scales = activation_max[name + ".self_attn.q_proj"]
-        smooth_weights(attn_ln, qkv, qkv_input_scales, alpha_to_apply)
-
-        ffn_ln = module.post_attention_layernorm
-        fcs = [module.mlp.gate_proj, module.mlp.up_proj]
-        fcs_input_scales = activation_max[name + ".mlp.gate_proj"]
-
-        smooth_weights(ffn_ln, fcs, fcs_input_scales, alpha_to_apply)
+
+        # Try each applier until one succeeds.
+        for applier in _APPLIERS:
+            if applier(name, module, activation_max, alpha_to_apply):
+                break  # applied → stop trying others
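The fairseq path relies on ReLU's positive homogeneity: scaling fc1's rows (and bias) by 1/s and fc2's columns by s leaves `fc2(relu(fc1(x)))` unchanged, so no runtime multiply is needed. A self-contained check of that identity on plain `nn.Linear` modules, computing `s` the same way as `_compute_s_for_linear` but with a random stand-in for the calibrated activation maxima:

```python
import torch

torch.manual_seed(0)
d_model, hidden = 4, 8
fc1 = torch.nn.Linear(d_model, hidden)
fc2 = torch.nn.Linear(hidden, d_model)
x = torch.randn(3, d_model)

ref = fc2(torch.relu(fc1(x)))  # original fc1 -> ReLU -> fc2 output

# Per-hidden-channel scale, mirroring the s = amax^alpha / w_col_max^(1-alpha)
# formula; `amax` stands in for fc2's calibrated input maxima (random here).
alpha = 0.5
amax = torch.rand(hidden) + 0.1
w_col_max = fc2.weight.detach().abs().max(dim=0)[0].clamp(min=1e-5)
s = (amax.pow(alpha) / w_col_max.pow(1.0 - alpha)).clamp(min=1e-5)

# Fold the scale into the weights, as the bridge fusion does:
# fc1 rows (and bias) by 1/s, fc2 columns by s.
with torch.no_grad():
    fc1.weight.mul_((1.0 / s).view(-1, 1))
    fc1.bias.mul_(1.0 / s)
    fc2.weight.mul_(s.view(1, -1))

fused = fc2(torch.relu(fc1(x)))
# ReLU is positive homogeneous (relu(z / s) * s == relu(z) for s > 0),
# so the bridge output is unchanged up to float rounding.
print(torch.allclose(ref, fused, atol=1e-5))  # True
```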
tico/experimental/quantization/config.py CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Dict, Optional
+from typing import Dict, Literal, Optional
 
 
 class BaseConfig(ABC):
@@ -60,9 +60,15 @@ class SmoothQuantConfig(BaseConfig):
         self,
         alpha: float = 0.5,
         custom_alpha_map: Optional[Dict[str, float]] = None,
+        acts_from: Literal["input", "output"] = "input",
     ):
         self.alpha = alpha
         self.custom_alpha_map = custom_alpha_map
+        # Where to collect activation statistics from:
+        #  - "input": use forward-pre-hook (Tensor before the Linear op)
+        #  - "output": use forward-hook (Tensor after the Linear op)
+        # Default is "input".
+        self.acts_from = acts_from
 
     @property
     def name(self) -> str:
tico/experimental/quantization/ptq/wrappers/fairseq/quant_mha.py ADDED
@@ -0,0 +1,383 @@
# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----------------------------------------------------------------------------
# This file includes modifications based on fairseq
# (https://github.com/facebookresearch/fairseq), originally licensed under
# the MIT License. See the LICENSE file in the fairseq repository for details.
# -----------------------------------------------------------------------------

from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from tico.experimental.quantization.ptq.quant_config import QuantConfig
from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
from tico.experimental.quantization.ptq.wrappers.quant_module_base import (
    QuantModuleBase,
)
from tico.experimental.quantization.ptq.wrappers.registry import try_register


@try_register("fairseq.modules.multihead_attention.MultiheadAttention")
class QuantFairseqMultiheadAttention(QuantModuleBase):
    """
    Quant-aware drop-in for Fairseq MultiheadAttention.

    - No xFormers / no torch F.multi_head_attention_forward fast-path.
    - Self/cross attention + minimal incremental KV cache.
    - Causal mask is pre-built statically; `key_padding_mask` is additive float.
    - I/O shape: [T, B, C]

    Runtime optimization flags
    --------------------------
    use_static_causal : bool
        If True, reuse a precomputed upper-triangular causal mask template
        instead of rebuilding it each forward step. Reduces per-step mask
        construction overhead during incremental decoding.

    assume_additive_key_padding : bool
        If True, assume the `key_padding_mask` is already an additive float
        tensor (large negative values at padded positions). Skips conversion
        from boolean masks, reducing runtime overhead.
    """

    def __init__(
        self,
        fp_attn: nn.Module,
        *,
        qcfg: Optional[QuantConfig] = None,
        fp_name: Optional[str] = None,
        max_seq: int = 4096,
        use_static_causal: bool = False,
        mask_fill_value: float = -120.0,
        assume_additive_key_padding: bool = False,
    ):
        super().__init__(qcfg, fp_name=fp_name)

        self.use_static_causal = use_static_causal
        self.mask_fill_value = mask_fill_value
        self.assume_additive_key_padding = assume_additive_key_padding
        self.embed_dim: int = int(fp_attn.embed_dim)  # type: ignore[arg-type]
        self.num_heads: int = int(fp_attn.num_heads)  # type: ignore[arg-type]
        self.head_dim: int = self.embed_dim // self.num_heads
        assert self.head_dim * self.num_heads == self.embed_dim

        self.self_attention: bool = bool(getattr(fp_attn, "self_attention", False))
        self.encoder_decoder_attention: bool = bool(
            getattr(fp_attn, "encoder_decoder_attention", False)
        )
        assert self.self_attention != self.encoder_decoder_attention

        # PTQ-wrapped projections
        qc = qcfg.child("q_proj") if qcfg else None
        kc = qcfg.child("k_proj") if qcfg else None
        vc = qcfg.child("v_proj") if qcfg else None
        oc = qcfg.child("out_proj") if qcfg else None
        assert hasattr(fp_attn, "q_proj") and hasattr(fp_attn, "k_proj")
        assert hasattr(fp_attn, "v_proj") and hasattr(fp_attn, "out_proj")
        assert isinstance(fp_attn.q_proj, nn.Module) and isinstance(
            fp_attn.k_proj, nn.Module
        )
        assert isinstance(fp_attn.v_proj, nn.Module) and isinstance(
            fp_attn.out_proj, nn.Module
        )
        self.q_proj = PTQWrapper(fp_attn.q_proj, qcfg=qc, fp_name=f"{fp_name}.q_proj")
        self.k_proj = PTQWrapper(fp_attn.k_proj, qcfg=kc, fp_name=f"{fp_name}.k_proj")
        self.v_proj = PTQWrapper(fp_attn.v_proj, qcfg=vc, fp_name=f"{fp_name}.v_proj")
        self.out_proj = PTQWrapper(
            fp_attn.out_proj, qcfg=oc, fp_name=f"{fp_name}.out_proj"
        )

        # scale & static causal mask
        self.register_buffer(
            "scale_const", torch.tensor(self.head_dim**-0.5), persistent=False
        )
        mask = torch.full((1, 1, max_seq, max_seq), float(self.mask_fill_value))
        mask.triu_(1)
        self.register_buffer("causal_mask_template", mask, persistent=False)

        # observers (no *_proj_out here; PTQWrapper handles module outputs)
        mk = self._make_obs
        self.obs_query_in = mk("query_in")
        self.obs_key_in = mk("key_in")
        self.obs_value_in = mk("value_in")
        self.obs_kpm_in = mk("kpm_in")
        self.obs_causal_mask = mk("causal_mask")
        self.obs_q_fold = mk("q_fold")
        self.obs_k_fold = mk("k_fold")
        self.obs_v_fold = mk("v_fold")
        self.obs_scale = mk("scale")
        self.obs_logits_raw = mk("logits_raw")
        self.obs_logits = mk("logits_scaled")
        self.obs_attn_mask_add = mk("obs_attn_mask_add")
        self.obs_kp_mask_add = mk("obs_kp_mask_add")
        self.obs_softmax = mk("softmax")
        self.obs_attn_out = mk("attn_out")

        safe_name = (
            fp_name if (fp_name not in (None, "", "None")) else f"QuantFsMHA_{id(self)}"
        )
        assert safe_name is not None
        self._state_key = safe_name + ".attn_state"

    def _get_input_buffer(
        self,
        incremental_state: Optional[Dict[str, Dict[str, Optional[torch.Tensor]]]],
    ) -> Optional[Dict[str, Optional[torch.Tensor]]]:
        """Return saved KV/mask dict or None."""
        if incremental_state is None:
            return None
        return incremental_state.get(self._state_key, None)

    def _set_input_buffer(
        self,
        incremental_state: Optional[Dict[str, Dict[str, Optional[torch.Tensor]]]],
        buffer: Dict[str, Optional[torch.Tensor]],
    ):
        """Store KV/mask dict in incremental_state."""
        if incremental_state is not None:
            incremental_state[self._state_key] = buffer
        return incremental_state

    # ---- utils ----
    def _fold_heads(self, x: torch.Tensor, B: int) -> torch.Tensor:
        # [T,B,E] -> [B*H, T, Dh]
        T = x.size(0)
        x = x.view(T, B, self.num_heads, self.head_dim).permute(1, 2, 0, 3).contiguous()
        return x.view(B * self.num_heads, T, self.head_dim)

    def _unfold_heads(self, x: torch.Tensor, B: int, T: int) -> torch.Tensor:
        # [B*H, T, Dh] -> [T,B,E]
        x = x.view(B, self.num_heads, T, self.head_dim).permute(2, 0, 1, 3).contiguous()
        return x.view(T, B, self.embed_dim)

    def forward(
        self,
        query: torch.Tensor,  # [Tq,B,C]
        key: Optional[torch.Tensor],
        value: Optional[torch.Tensor],
        key_padding_mask: Optional[
            torch.Tensor
        ] = None,  # additive float (e.g. -120 at pads)
        incremental_state: Optional[
            Dict[str, Dict[str, Optional[torch.Tensor]]]
        ] = None,
        need_weights: bool = False,
        static_kv: bool = False,
        attn_mask: Optional[torch.Tensor] = None,  # if None -> internal causal
        before_softmax: bool = False,
        need_head_weights: bool = False,
        return_new_kv: bool = False,
    ) -> Union[
        Tuple[torch.Tensor, Optional[torch.Tensor]],
        Tuple[
            torch.Tensor,
            Optional[torch.Tensor],
            Optional[torch.Tensor],
            Optional[torch.Tensor],
        ],
    ]:

        if need_head_weights:
            need_weights = True

        Tq, B, _ = query.shape
        if self.self_attention:
            key = query if key is None else key
            value = query if value is None else value
        else:
            assert key is not None and value is not None

        Tk, Bk, _ = key.shape
        Tv, Bv, _ = value.shape
        assert B == Bk == Bv

        q = self.q_proj(self._fq(query, self.obs_query_in))
        k = self.k_proj(self._fq(key, self.obs_key_in))
        v = self.v_proj(self._fq(value, self.obs_value_in))

        state = self._get_input_buffer(incremental_state)
        if incremental_state is not None and state is None:
            state = {}

        # Capture "new" K/V for this call BEFORE concatenating with cache
        new_k_bh: Optional[torch.Tensor] = None
        new_v_bh: Optional[torch.Tensor] = None

        # Fold heads
        q = self._fq(self._fold_heads(q, B), self.obs_q_fold)
        if state is not None and "prev_key" in state and static_kv:
            # Cross-attention static_kv path: reuse cached KV; there is no new KV this call.
            k = None
            v = None
        if k is not None:
            k = self._fq(self._fold_heads(k, B), self.obs_k_fold)  # [B*H, Tnew, Dh]
            if return_new_kv:
                new_k_bh = k.contiguous()
        if v is not None:
            v = self._fq(self._fold_heads(v, B), self.obs_v_fold)  # [B*H, Tnew, Dh]
            if return_new_kv:
                new_v_bh = v.contiguous()

        # Append/reuse cache
        if state is not None:
            pk = state.get("prev_key")
            pv = state.get("prev_value")
            if pk is not None:
                pk = pk.view(B * self.num_heads, -1, self.head_dim)
                k = pk if static_kv else torch.cat([pk, k], dim=1)
            if pv is not None:
                pv = pv.view(B * self.num_heads, -1, self.head_dim)
                v = pv if static_kv else torch.cat([pv, v], dim=1)

        assert k is not None and v is not None
        Ts = k.size(1)

        # Scaled dot-product
        scale = self._fq(self.scale_const, self.obs_scale).to(q.dtype)
        logits_raw = self._fq(
            torch.bmm(q, k.transpose(1, 2)), self.obs_logits_raw
        )  # [B*H,Tq,Ts]
        logits = self._fq(logits_raw * scale, self.obs_logits)

        assert isinstance(self.causal_mask_template, torch.Tensor)
        # Masks
        device = logits.device
        if attn_mask is None and self.use_static_causal:
            # Incremental decoding aware slicing:
            # align the causal row(s) to the current time indices
            start_q = max(Ts - Tq, 0)
            cm = self.causal_mask_template[..., start_q : start_q + Tq, :Ts].to(
                device=device, dtype=logits.dtype
            )
            attn_mask = cm.squeeze(0).squeeze(0)  # [Tq,Ts]

        if attn_mask is not None:
            # Bool/byte mask -> additive float with large negatives
            if not torch.is_floating_point(attn_mask):
                fill = self.causal_mask_template.new_tensor(self.mask_fill_value)
                attn_mask = torch.where(
                    attn_mask.to(torch.bool), fill, fill.new_zeros(())
                )
            attn_mask = self._fq(attn_mask, self.obs_causal_mask)
            assert isinstance(attn_mask, torch.Tensor)

            if not self.assume_additive_key_padding:
                # attn_mask -> [B*H,Tq,Ts]
                if attn_mask.dim() == 2:
                    add_mask = attn_mask.unsqueeze(0).expand(logits.size(0), -1, -1)
                elif attn_mask.dim() == 3:
                    add_mask = (
                        attn_mask.unsqueeze(1)
                        .expand(B, self.num_heads, Tq, Ts)
                        .contiguous()
                    )
                    add_mask = add_mask.view(B * self.num_heads, Tq, Ts)
                else:
                    raise RuntimeError("attn_mask must be [T,S] or [B,T,S]")
            else:
                add_mask = attn_mask
            logits = self._fq(logits + add_mask, self.obs_attn_mask_add)

        if key_padding_mask is not None:
            if not torch.is_floating_point(key_padding_mask):
                fill = self.causal_mask_template.new_tensor(self.mask_fill_value)
                kpm = torch.where(
                    key_padding_mask.to(torch.bool), fill, fill.new_zeros(())
                )
            else:
                kpm = key_padding_mask
            kpm = self._fq(kpm, self.obs_kpm_in)

            if not self.assume_additive_key_padding:
                # key_padding_mask: additive float already
                kpm = kpm.to(dtype=logits.dtype, device=device)
                if kpm.dim() == 2:  # [B,S]
                    kpm = (
                        kpm.view(B, 1, 1, Ts)
                        .expand(B, self.num_heads, Tq, Ts)
                        .contiguous()
                    )
                    kpm = kpm.view(B * self.num_heads, Tq, Ts)
                elif kpm.dim() == 3:  # [B,T,S]
                    kpm = (
                        kpm.unsqueeze(1).expand(B, self.num_heads, Tq, Ts).contiguous()
                    )
                    kpm = kpm.view(B * self.num_heads, Tq, Ts)
                else:
                    raise RuntimeError(
                        "key_padding_mask must be [B,S] or [B,T,S] (additive)"
                    )
            logits = self._fq(logits + kpm, self.obs_kp_mask_add)

        if before_softmax:
            if return_new_kv:
                return logits, v, new_k_bh, new_v_bh
            return logits, v

        # Softmax (float32) -> back to q.dtype
        attn_probs = torch.softmax(logits, dim=-1, dtype=torch.float32).to(q.dtype)
        attn_probs = self._fq(attn_probs, self.obs_softmax)

        # Context + output proj
        ctx = self._fq(torch.bmm(attn_probs, v), self.obs_attn_out)  # [B*H,Tq,Dh]
        ctx = self._unfold_heads(ctx, B, Tq)  # [Tq,B,E]
        out = self.out_proj(ctx)

        # Weights (optional)
        attn_weights_out: Optional[torch.Tensor] = None
        if need_weights:
            aw = (
                torch.softmax(logits, dim=-1, dtype=torch.float32)
                .view(B, self.num_heads, Tq, Ts)
                .transpose(1, 0)
            )
            if not need_head_weights:
                aw = aw.mean(dim=1)  # [B,Tq,Ts]
            attn_weights_out = aw

        # Cache write
        if state is not None:
            state["prev_key"] = k.view(B, self.num_heads, -1, self.head_dim).detach()
            state["prev_value"] = v.view(B, self.num_heads, -1, self.head_dim).detach()
            self._set_input_buffer(incremental_state, state)

        if return_new_kv:
            return out, attn_weights_out, new_k_bh, new_v_bh
        return out, attn_weights_out

    def _all_observers(self):
        yield from (
            self.obs_query_in,
            self.obs_key_in,
            self.obs_value_in,
            self.obs_kpm_in,
            self.obs_causal_mask,
            self.obs_q_fold,
            self.obs_k_fold,
            self.obs_v_fold,
            self.obs_scale,
            self.obs_logits_raw,
            self.obs_logits,
            self.obs_attn_mask_add,
            self.obs_kp_mask_add,
            self.obs_softmax,
            self.obs_attn_out,
        )
        for m in (self.q_proj, self.k_proj, self.v_proj, self.out_proj):
            if isinstance(m, QuantModuleBase):
                yield from m._all_observers()
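The static causal mask described in the wrapper docstring can be exercised in isolation. The sketch below rebuilds the same kind of template (a small `max_seq` and the default `-120.0` fill, chosen here for readability) and shows how the `start_q = Ts - Tq` slice lines the mask row up with the current decode position; the values are illustrative, not taken from a real run:

```python
import torch

# Rebuild a template the way the wrapper's __init__ does, but tiny.
max_seq, fill = 8, -120.0
template = torch.full((1, 1, max_seq, max_seq), fill)
template.triu_(1)  # 0 on/below the diagonal, `fill` strictly above it

# Incremental decoding step: one new query (Tq=1) attending to Ts=5 keys.
Tq, Ts = 1, 5
start_q = max(Ts - Tq, 0)
cm = template[..., start_q : start_q + Tq, :Ts].squeeze(0).squeeze(0)  # [Tq, Ts]
print(cm)  # all zeros: the newest position may attend to every cached key

# Prefill step: Tq == Ts == 5 reproduces the usual upper-triangular mask.
Tq = Ts = 5
full = template[..., 0:Tq, :Ts].squeeze(0).squeeze(0)
print(torch.equal(full, torch.full((5, 5), fill).triu_(1)))  # True
```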
tico/experimental/quantization/ptq/wrappers/registry.py CHANGED
@@ -28,9 +28,12 @@ _CORE_MODULES (
     "tico.experimental.quantization.ptq.wrappers.nn.quant_layernorm",
     "tico.experimental.quantization.ptq.wrappers.nn.quant_linear",
     "tico.experimental.quantization.ptq.wrappers.nn.quant_silu",
+    # llama
     "tico.experimental.quantization.ptq.wrappers.llama.quant_attn",
     "tico.experimental.quantization.ptq.wrappers.llama.quant_decoder_layer",
     "tico.experimental.quantization.ptq.wrappers.llama.quant_mlp",
+    # fairseq
+    "tico.experimental.quantization.ptq.wrappers.fairseq.quant_mha",
     # add future core wrappers here
 )
 
{tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
-tico/__init__.py,sha256=
+tico/__init__.py,sha256=KNsLIqvVA_qyf9VOIt_PmHk0L_QUjJqnldhE8_X4n6I,1883
 tico/pt2_to_circle.py,sha256=gu3MD4Iqc0zMZcCZ2IT8oGbyj21CTSbT3Rgd9s2B_9A,2767
 tico/config/__init__.py,sha256=xZzCXjZ84qE-CsBi-dfaL05bqpQ3stKKfTXhnrJRyVs,142
 tico/config/base.py,sha256=q5xMqGxTUZs4mFqt5c7i_y9U00fYgdMGl9nUqIVMlCo,1248
@@ -6,7 +6,7 @@ tico/config/factory.py,sha256=il0zqB6Lm5NX2LnG-TUhmiP9vVeZ_3TucJMorVZIodY,1324
 tico/config/v1.py,sha256=O1jzpUBDwoWpLohEpI08pJNwVB-yz3ufPrQm2_XWq4Y,1108
 tico/experimental/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/__init__.py,sha256=IaJPZegVJp0P3luutBo907Kp5sOJensE1Mm-XBG_jBs,122
-tico/experimental/quantization/config.py,sha256=
+tico/experimental/quantization/config.py,sha256=nMepa_H471t7f3bKMvR8cZUZgruy_8kdb147rBkTWCQ,2004
 tico/experimental/quantization/public_interface.py,sha256=4-v9VXsokRG2-UUYYHd_MlbHxChqdGI5iuySyYDY_Pw,4420
 tico/experimental/quantization/quantizer.py,sha256=_2pDtWFKDCuKfYF2bptOwIYsa0VFNFM1ZNgi8_OGvHM,2365
 tico/experimental/quantization/algorithm/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
@@ -37,9 +37,9 @@ tico/experimental/quantization/algorithm/pt2e/annotation/op/sub.py,sha256=4z8HoY
 tico/experimental/quantization/algorithm/pt2e/transformation/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/algorithm/pt2e/transformation/convert_scalars_to_attrs.py,sha256=Idtoya2RcGKlgUJgC9WqNz0jH3gf6ViuPmsD9ySHbls,2253
 tico/experimental/quantization/algorithm/smoothquant/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
-tico/experimental/quantization/algorithm/smoothquant/observer.py,sha256=
-tico/experimental/quantization/algorithm/smoothquant/quantizer.py,sha256=
-tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py,sha256=
+tico/experimental/quantization/algorithm/smoothquant/observer.py,sha256=OWBKQ3ox6PqeqgevxOjpXvb7uApoqE4YbUBelGhVSN8,3435
+tico/experimental/quantization/algorithm/smoothquant/quantizer.py,sha256=QuZBi24L-LYI26nwZd6JmTdokxr6-l_vIgZvWVdqx_o,3637
+tico/experimental/quantization/algorithm/smoothquant/smooth_quant.py,sha256=fxCy4m-BsSjraciSVPFlPhgsOT46RjrOgczQGb7B9TA,11561
 tico/experimental/quantization/evaluation/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/evaluation/backend.py,sha256=CZL9rZOA0t8cH7PHp6u9l7dGqWNvTj9bKOvwo0PVul0,692
 tico/experimental/quantization/evaluation/evaluate.py,sha256=kfa_GvFaX6DoSTAmuCImMJqF2jgqtnor5UpC7wVmGPI,7877
@@ -84,7 +84,9 @@ tico/experimental/quantization/ptq/wrappers/__init__.py,sha256=IO6FP_xYbGy0dW0HL
 tico/experimental/quantization/ptq/wrappers/ptq_wrapper.py,sha256=F9sK_DiRaXiGNHULcwIbs5EUtHz6ZJ7N4r5CWTTfhsM,2442
 tico/experimental/quantization/ptq/wrappers/quant_elementwise.py,sha256=LhEoobfvto6zKrBOKL4gmxfFFc31jHzyQV_zfps-iQM,3604
 tico/experimental/quantization/ptq/wrappers/quant_module_base.py,sha256=vkcDos_knGSS29rIZuEIWkAJLHrENbGz8nCH2-iara8,5969
-tico/experimental/quantization/ptq/wrappers/registry.py,sha256=
+tico/experimental/quantization/ptq/wrappers/registry.py,sha256=cblE6dpLlK1lh61Xpqxcr2Vwi9XuqdrV5Y7TPSjpETQ,4940
+tico/experimental/quantization/ptq/wrappers/fairseq/__init__.py,sha256=Mc8FLd9DusyB_IT1vk1OYrRkngOYnYd05IvtA9ORVQc,160
+tico/experimental/quantization/ptq/wrappers/fairseq/quant_mha.py,sha256=HsigmOLeacLXc46QNeFqwQ0DwKQhNrtWTKEtLJoqXoc,15562
 tico/experimental/quantization/ptq/wrappers/llama/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/experimental/quantization/ptq/wrappers/llama/quant_attn.py,sha256=-K1COLHIHfJZhQu-RE6KfJIkaL7S6yR4iUj48QkjMTw,8652
 tico/experimental/quantization/ptq/wrappers/llama/quant_decoder_layer.py,sha256=2XsIf5rcabDXXkahqriSxfo2curFq0Y5bnRPcYkJPg8,7187
@@ -247,9 +249,9 @@ tico/utils/mx/__init__.py,sha256=IO6FP_xYbGy0dW0HL26GXD3ouxARaxCK7bz9dn4blPQ,26
 tico/utils/mx/elemwise_ops.py,sha256=V6glyAHsVR1joqpsgnNytatCD_ew92xNWZ19UFDoMTA,10281
 tico/utils/mx/formats.py,sha256=uzNWyu-1onUlwQfX5cZ6fZSUfHMRqorper7_T1k3jfk,3404
 tico/utils/mx/mx_ops.py,sha256=RcfUTYVi-wilGB2sC35OeARdwDqnixv7dG5iyZ-fQT8,8555
-tico-0.1.0.
-tico-0.1.0.
-tico-0.1.0.
-tico-0.1.0.
-tico-0.1.0.
-tico-0.1.0.
+tico-0.1.0.dev250908.dist-info/LICENSE,sha256=kp4JLII7bzRhPb0CPD5XTDZMh22BQ7h3k3B7t8TiSbw,12644
+tico-0.1.0.dev250908.dist-info/METADATA,sha256=SBFNzxkvYrsWshqSGGkmtVoHtDk1Rp9wVNfhdIRsFdg,8450
+tico-0.1.0.dev250908.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+tico-0.1.0.dev250908.dist-info/entry_points.txt,sha256=kBKYSS_IYrSXmUYevmmepqIVPScq5vF8ulQRu3I_Zf0,59
+tico-0.1.0.dev250908.dist-info/top_level.txt,sha256=oqs7UPoNSKZEwqsX8B-KAWdQwfAa7i60pbxW_Jk7P3w,5
+tico-0.1.0.dev250908.dist-info/RECORD,,
{tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/LICENSE: File without changes
{tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/WHEEL: File without changes
{tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/entry_points.txt: File without changes
{tico-0.1.0.dev250904.dist-info → tico-0.1.0.dev250908.dist-info}/top_level.txt: File without changes