tico 0.1.0.dev251106__py3-none-any.whl → 0.2.0.dev260122__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. tico/__init__.py +2 -2
  2. tico/_version.py +1 -0
  3. tico/passes/convert_conv3d_to_conv2d.py +435 -0
  4. tico/passes/convert_sym_size_to_circle_shape.py +99 -0
  5. tico/passes/decompose_batch_norm.py +9 -5
  6. tico/passes/lower_copy.py +95 -0
  7. tico/passes/ops.py +4 -0
  8. tico/quantization/algorithm/fpi_gptq/fpi_gptq.py +251 -0
  9. tico/quantization/algorithm/fpi_gptq/quantizer.py +180 -0
  10. tico/quantization/algorithm/gptq/gptq.py +231 -11
  11. tico/quantization/algorithm/gptq/quantizer.py +18 -6
  12. tico/quantization/config/{pt2e.py → fpi_gptq.py} +11 -4
  13. tico/quantization/config/gptq.py +27 -4
  14. tico/quantization/public_interface.py +0 -10
  15. tico/quantization/wrapq/quantizer.py +2 -0
  16. tico/quantization/wrapq/wrappers/quant_elementwise.py +51 -11
  17. tico/serialize/operators/adapters/onert/llama_attention.py +51 -0
  18. tico/serialize/operators/op_attention.py +58 -0
  19. tico/serialize/operators/op_circle_shape.py +64 -0
  20. tico/serialize/operators/op_dequantize_per_channel.py +1 -0
  21. tico/serialize/operators/op_dequantize_per_tensor.py +1 -0
  22. tico/serialize/operators/op_transpose_conv.py +66 -50
  23. tico/utils/convert.py +16 -1
  24. tico/utils/padding.py +13 -5
  25. tico/utils/record_input.py +2 -2
  26. tico/utils/register_custom_op.py +63 -0
  27. tico/utils/validate_args_kwargs.py +49 -4
  28. tico-0.2.0.dev260122.dist-info/METADATA +631 -0
  29. {tico-0.1.0.dev251106.dist-info → tico-0.2.0.dev260122.dist-info}/RECORD +35 -46
  30. {tico-0.1.0.dev251106.dist-info → tico-0.2.0.dev260122.dist-info}/WHEEL +1 -1
  31. {tico-0.1.0.dev251106.dist-info → tico-0.2.0.dev260122.dist-info}/entry_points.txt +0 -1
  32. tico/quantization/algorithm/pt2e/annotation/annotator.py +0 -208
  33. tico/quantization/algorithm/pt2e/annotation/config.py +0 -26
  34. tico/quantization/algorithm/pt2e/annotation/op/__init__.py +0 -21
  35. tico/quantization/algorithm/pt2e/annotation/op/adaptive_avg_pool2d.py +0 -63
  36. tico/quantization/algorithm/pt2e/annotation/op/add.py +0 -55
  37. tico/quantization/algorithm/pt2e/annotation/op/conv2d.py +0 -90
  38. tico/quantization/algorithm/pt2e/annotation/op/div.py +0 -55
  39. tico/quantization/algorithm/pt2e/annotation/op/linear.py +0 -92
  40. tico/quantization/algorithm/pt2e/annotation/op/mean.py +0 -51
  41. tico/quantization/algorithm/pt2e/annotation/op/mul.py +0 -55
  42. tico/quantization/algorithm/pt2e/annotation/op/relu6.py +0 -51
  43. tico/quantization/algorithm/pt2e/annotation/op/rsqrt.py +0 -51
  44. tico/quantization/algorithm/pt2e/annotation/op/sub.py +0 -55
  45. tico/quantization/algorithm/pt2e/annotation/spec.py +0 -45
  46. tico/quantization/algorithm/pt2e/annotation/utils.py +0 -88
  47. tico/quantization/algorithm/pt2e/quantizer.py +0 -81
  48. tico/quantization/algorithm/pt2e/transformation/__init__.py +0 -1
  49. tico/quantization/algorithm/pt2e/transformation/convert_scalars_to_attrs.py +0 -58
  50. tico/quantization/algorithm/pt2e/utils.py +0 -135
  51. tico/serialize/operators/op_copy.py +0 -187
  52. tico-0.1.0.dev251106.dist-info/METADATA +0 -392
  53. /tico/quantization/algorithm/{pt2e → fpi_gptq}/__init__.py +0 -0
  54. /tico/{quantization/algorithm/pt2e/annotation → serialize/operators/adapters/onert}/__init__.py +0 -0
  55. {tico-0.1.0.dev251106.dist-info → tico-0.2.0.dev260122.dist-info/licenses}/LICENSE +0 -0
  56. {tico-0.1.0.dev251106.dist-info → tico-0.2.0.dev260122.dist-info}/top_level.txt +0 -0
tico/passes/ops.py CHANGED
@@ -38,6 +38,10 @@ class AtenOps:
38
38
  torch.ops.aten.conv1d.default,
39
39
  torch.ops.aten.conv1d.padding,
40
40
  ]
41
+ self.conv3d = [
42
+ torch.ops.aten.conv3d.default,
43
+ torch.ops.aten.conv3d.padding,
44
+ ]
41
45
  self.detach = [
42
46
  torch.ops.aten.detach_.default,
43
47
  torch.ops.aten.detach.default,
@@ -0,0 +1,251 @@
1
+ # Copyright IST-DASLab. 2025. (commit: 2d65066). GitHub repository.
2
+ # Retrieved from https://github.com/IST-DASLab/gptq. Licensed under the
3
+ # Apache License 2.0.
4
+
5
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ # https://github.com/IST-DASLab/gptq/blob/2d65066/gptq.py
20
+
21
+ import math
22
+ import time
23
+ from typing import Optional
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+
28
+ from tico.quantization.algorithm.gptq.gptq import (
29
+ conv2d_weights_to_convtranspose2d_weights,
30
+ convtranspose2d_weights_to_conv2d_weights,
31
+ get_matmul_input_for_convtranspose2d,
32
+ )
33
+
34
+ from tico.quantization.algorithm.gptq.quant import quantize, Quantizer
35
+
36
+
37
def iterate_GPTQ(scale, zero, maxq, W, Hinv, max_num_of_iters=50):
    """Refine weights by fixed-point iteration and return their quantization.

    Every pass quantizes the current iterate, scales the quantization error
    column-wise by the reciprocal of ``diag(Hinv)``, multiplies it by the
    strictly upper-triangular part of ``Hinv`` and subtracts the result from
    the *original* weights ``W`` to obtain the next iterate.

    Args:
        scale: Quantization scale, forwarded unchanged to ``quantize``.
        zero: Quantization zero point, forwarded unchanged to ``quantize``.
        maxq: Maximum quantized level, forwarded unchanged to ``quantize``.
        W: Weight tensor whose last dimension matches the size of ``Hinv``.
            It is only read, never modified.
        Hinv: Square matrix; only its diagonal and strict upper triangle are
            used. ``FPI_GPTQ.fasterquant`` passes the upper Cholesky factor
            of the inverse Hessian here.
        max_num_of_iters: Number of refinement passes to run.

    Returns:
        A tuple ``(Q, W_refined)`` where ``W_refined`` is the last iterate and
        ``Q`` is ``quantize(W_refined, scale, zero, maxq)``.
    """
    # Per-column factor 1 / Hinv[i, i] applied to the quantization error.
    mults = torch.pow(torch.diag(Hinv), -1)
    Hinv_U = torch.triu(Hinv, diagonal=1)

    # `W` itself serves as the fixed starting point: nothing below mutates it,
    # so a second full-size copy is not needed. `cur_weights` is cloned so the
    # returned tensor never aliases the caller's `W` (relevant for 0 passes).
    cur_weights = W.clone()
    for _ in range(max_num_of_iters):
        cur_Q = quantize(cur_weights, scale, zero, maxq)

        d_W = torch.mul(cur_weights - cur_Q, mults)
        cur_weights = W - torch.matmul(d_W, Hinv_U)
        # Drop the temporaries before the next pass allocates new ones, to
        # keep peak memory down.
        del d_W, cur_Q

    cur_Q = quantize(cur_weights, scale, zero, maxq)

    return cur_Q, cur_weights
58
+
59
+
60
class FPI_GPTQ:
    """Per-layer GPTQ state that quantizes weights via fixed-point iteration.

    The object accumulates a Hessian estimate ``H`` from layer inputs
    (``add_batch``), then rewrites ``layer.weight.data`` with quantized values
    (``fasterquant``). Handled layer types are ``nn.Linear``, ``nn.Conv1d``,
    ``nn.Conv2d`` and ``nn.ConvTranspose2d``; convolution weights are viewed
    as a 2-D matrix of shape ``(rows, columns)``.
    """

    def __init__(self, layer):
        """Record the layer and allocate a zeroed ``columns x columns`` Hessian.

        Args:
            layer: Module exposing a ``weight`` parameter.
        """
        self.layer = layer
        self.dev = self.layer.weight.device
        # Only the shape of the matrix view is needed here, so the weight is
        # not copied (fasterquant hands un-cloned weight data to the same
        # ConvTranspose2d conversion helper as well).
        W = layer.weight.data
        if isinstance(self.layer, (nn.Conv2d, nn.Conv1d)):
            W = W.flatten(1)
        elif isinstance(self.layer, nn.ConvTranspose2d):
            W = convtranspose2d_weights_to_conv2d_weights(self.layer, W)
            W = W.flatten(1)

        self.rows = W.shape[0]
        self.columns = W.shape[1]
        self.H: Optional[torch.Tensor] = torch.zeros(
            (self.columns, self.columns), device=self.dev
        )
        self.nsamples = 0
        self.quantizer: Quantizer = Quantizer()

    def add_batch(self, inp, out):
        """Fold one batch of layer inputs into the running Hessian estimate.

        Args:
            inp: Input tensor that was fed to ``self.layer``. A 2-D tensor, or
                a 3-D tensor for ``nn.Conv2d``, is treated as unbatched and
                gets a leading batch axis of size 1.
            out: Layer output; accepted for hook compatibility and unused.
        """
        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        if isinstance(self.layer, nn.Conv2d) and len(inp.shape) == 3:
            # nn.Conv2d also accepts an unbatched (C, H, W) input; add the
            # batch axis so the unfold/permute steps below see 4-D data.
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear):
            if len(inp.shape) > 2:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()
        if isinstance(self.layer, nn.Conv2d):
            unfold = nn.Unfold(
                self.layer.kernel_size,
                dilation=self.layer.dilation,
                padding=self.layer.padding,
                stride=self.layer.stride,
            )

            if self.layer.groups != 1:
                # The idea behind converting a grouped/depthwise convolution
                # to matmul is described here:
                # https://discuss.pytorch.org/t/conv1d-implementation-using-torch-nn-functional-unfold/109643/2
                # Although a grouped convolution equals a set of MatMuls
                # (note that `w.view(1, groups, out_channels // groups, -1)`
                # in the reference above is not just w.flatten(1)), the
                # group-wise Hessians are approximated by their mean so that
                # a single Hessian remains and the usual GPTQ applies.
                inp = inp.reshape(
                    inp.size(0) * self.layer.groups,
                    inp.size(1) // self.layer.groups,
                    inp.shape[2],
                    inp.shape[3],
                )  # (batch*groups, in_channels / groups, H, W): each group is convolved with its own filter

            inp = unfold(
                inp
            )  # (batch*groups, k_h*k_w*in_channels / groups, flattened_patches)
            inp = inp.permute(
                [1, 0, 2]
            )  # (k_h*k_w*in_channels / groups, batch*groups, flattened_patches)
            inp = inp.flatten(
                1
            )  # (k_h*k_w*in_channels / groups, batch*groups*flattened_patches)
            # so inp.matmul(inp.t()) is square with side
            # k_h*k_w*in_channels / groups, the column count of W.flatten(1)

        if isinstance(self.layer, nn.Conv1d):
            # nn.Conv1d is handled like nn.Conv2d: represent it as a
            # conv2d with a (1, k) kernel on input reshaped to
            # (batch, in_channels, 1, L).
            # TODO reduce code duplication
            unfold = nn.Unfold(
                (1, self.layer.kernel_size[0]),
                dilation=(1, self.layer.dilation[0]),
                padding=(0, self.layer.padding[0]),
                stride=(1, self.layer.stride[0]),
            )
            if self.layer.groups != 1:
                # See the Conv2d branch for the reasoning.
                inp = inp.reshape(
                    inp.size(0) * self.layer.groups,
                    inp.size(1) // self.layer.groups,
                    inp.shape[2],
                )  # (batch*groups, in_channels / groups, L): each group is convolved with its own filter

            inp = inp.unsqueeze(
                -2
            )  # (batch*groups, in_channels / groups, L) -> (batch*groups, in_channels / groups, 1, L)
            inp = unfold(inp)
            inp = inp.permute([1, 0, 2])
            inp = inp.flatten(1)
        if isinstance(self.layer, nn.ConvTranspose2d):
            inp = get_matmul_input_for_convtranspose2d(self.layer, inp)

        # Running average: down-weight the old estimate by the share of
        # samples it represents, then add this batch's contribution.
        self.H *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        inp = math.sqrt(2 / self.nsamples) * inp.float()
        self.H += inp.matmul(inp.t())

    def fasterquant(
        self,
        percdamp=0.01,
        verbose=False,
    ):
        """Quantize the layer weight in place using the accumulated Hessian.

        The ``H`` attribute is consumed (deleted from the instance) by this
        call; ``free`` resets it to ``None`` afterwards.

        Args:
            percdamp: Fraction of the mean Hessian diagonal added to the
                diagonal before the Cholesky factorization.
            verbose: When true, print the elapsed time and the summed loss.
        """
        W = self.layer.weight.data.clone()
        if isinstance(self.layer, (nn.Conv2d, nn.Conv1d)):
            W = W.flatten(1)
        elif isinstance(self.layer, nn.ConvTranspose2d):
            W = convtranspose2d_weights_to_conv2d_weights(self.layer, W)
            conv2d_shape = W.shape
            W = W.flatten(1)  # reshaped to matrix (OUT_channels x the_rest)

        W = W.float()
        tick = time.time()
        if not self.quantizer.ready():
            self.quantizer.find_params(W, weight=True)

        H = self.H
        del self.H
        assert isinstance(H, torch.Tensor)
        # Columns with a zero Hessian diagonal: give them a unit diagonal and
        # zero their weights for the solve; they are re-quantized from the
        # original weights further below.
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        W[:, dead] = 0

        # actorder: process columns by decreasing Hessian diagonal.
        perm = torch.argsort(torch.diag(H), descending=True)
        W = W[:, perm]
        H = H[perm][:, perm]
        invperm = torch.argsort(perm)

        damp = percdamp * torch.mean(torch.diag(H))
        diag = torch.arange(self.columns, device=self.dev)
        H[diag, diag] += damp
        # Upper Cholesky factor of the inverse of the damped Hessian.
        H = torch.linalg.cholesky(H)
        assert isinstance(H, torch.Tensor)
        H = torch.cholesky_inverse(H)
        H = torch.linalg.cholesky(H, upper=True)
        Hinv = H

        Q, W = iterate_GPTQ(
            self.quantizer.scale,
            self.quantizer.zero,
            self.quantizer.maxq,
            W,
            Hinv=Hinv,
            max_num_of_iters=min(
                50, self.columns
            ),  # we don't need to iterate more than self.columns
        )

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        if verbose:
            print("time %.2f" % (time.time() - tick))
            Losses = 0.5 * ((Q - W) / torch.diag(Hinv)) ** 2
            print("error", torch.sum(Losses).item())

        # Undo the actorder permutation.
        Q = Q[:, invperm]

        # Dead columns are quantized directly from the original weights.
        if isinstance(self.layer, (nn.Conv2d, nn.Conv1d)):
            Q[:, dead] = quantize(
                self.layer.weight.flatten(1)[:, dead],
                self.quantizer.scale,
                self.quantizer.zero,
                self.quantizer.maxq,
            )
        elif isinstance(self.layer, nn.ConvTranspose2d):
            Q[:, dead] = quantize(
                convtranspose2d_weights_to_conv2d_weights(
                    self.layer, self.layer.weight.data
                ).flatten(1)[:, dead],
                self.quantizer.scale,
                self.quantizer.zero,
                self.quantizer.maxq,
            )
        else:
            Q[:, dead] = quantize(
                self.layer.weight[:, dead],
                self.quantizer.scale,
                self.quantizer.zero,
                self.quantizer.maxq,
            )

        if isinstance(self.layer, nn.ConvTranspose2d):
            Q_conv2d = Q.reshape(conv2d_shape).to(self.layer.weight.data.dtype)
            self.layer.weight.data = conv2d_weights_to_convtranspose2d_weights(
                self.layer, Q_conv2d
            )
        else:
            self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(
                self.layer.weight.data.dtype
            )

    def free(self):
        """Drop the Hessian reference and release cached CUDA memory if any."""
        self.H = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
@@ -0,0 +1,180 @@
1
+ # Copyright (c) 2024 Intel Corporation
2
+ # Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Any, Dict
17
+
18
+ import torch
19
+ from tqdm.auto import tqdm
20
+
21
+ from tico.quantization.algorithm.fpi_gptq.fpi_gptq import FPI_GPTQ
22
+ from tico.quantization.algorithm.gptq.quantizer import GPTQQuantizer
23
+ from tico.quantization.algorithm.gptq.utils import (
24
+ find_layers,
25
+ gather_single_batch_from_dict,
26
+ gather_single_batch_from_list,
27
+ )
28
+ from tico.quantization.config.fpi_gptq import FPIGPTQConfig
29
+ from tico.quantization.quantizer_registry import register_quantizer
30
+
31
+
32
@register_quantizer(FPIGPTQConfig)
class FPIGPTQQuantizer(GPTQQuantizer):
    """
    Quantizer for applying the Fixed Point Iteration GPTQ algorithm (FPIGPTQ)
    This implementation expects the same steps as GPTQQuantizer.
    It should produce results very close to reference GPTQ but much faster when running on cuda.
    """

    def __init__(self, config: FPIGPTQConfig):
        super().__init__(config)

    @torch.no_grad()
    def convert(self, model):
        """Quantize ``model`` layer by layer with ``FPI_GPTQ`` and return it.

        For every target layer the method collects Hessian statistics over the
        cached calibration batches, quantizes each Linear / Conv1d / Conv2d /
        ConvTranspose2d submodule, then re-runs the layer so that its outputs
        become the cached inputs of the next layer. The per-submodule
        quantizers are attached to the model as ``model.quantizers``.

        The method reads ``_orig_model_forward``, ``_first_layer_ref``,
        ``_orig_layer_forward``, ``cache_args``, ``cache_kwargs`` and
        ``num_batches`` from ``self``; these are presumably populated by the
        inherited preparation/calibration steps, which are not visible here.

        Args:
            model: The model to quantize in place.

        Returns:
            The same ``model`` object, with quantized weights.
        """
        # Restore original forwards (we no longer want to stop after first layer)
        assert self._orig_model_forward is not None
        model.forward = self._orig_model_forward
        assert (
            self._first_layer_ref is not None and self._orig_layer_forward is not None
        )
        self._first_layer_ref.forward = self._orig_layer_forward

        gptq_conf = self.config
        assert isinstance(gptq_conf, FPIGPTQConfig)

        # Disable use_cache during calibration. A separate flag records
        # whether the attribute existed, so an original value of None is
        # restored too.
        has_use_cache = hasattr(model, "config") and hasattr(
            model.config, "use_cache"
        )
        orig_use_cache = None
        if has_use_cache:
            orig_use_cache = model.config.use_cache
            model.config.use_cache = False

        # Identify layers
        if hasattr(model, "model"):
            target_layers = model.model.layers
        else:
            target_layers = [model]

        def run_layer(layer, batch_idx):
            # Forward one cached calibration batch through `layer`.
            cache_args_batch = gather_single_batch_from_list(
                self.cache_args, batch_idx
            )
            cache_kwargs_batch = gather_single_batch_from_dict(
                self.cache_kwargs, batch_idx
            )
            return layer(*cache_args_batch, **cache_kwargs_batch)

        def make_hook(collector):
            # Forward hook feeding (inp, out) of a submodule to its FPI_GPTQ.
            def _hook(_, inp, out):
                collector.add_batch(inp[0].data, out.data)

            return _hook

        quantizers: Dict[str, Any] = {}
        try:
            for l_idx, layer in enumerate(
                tqdm(
                    target_layers,
                    desc="Quantizing layers",
                    unit="layer",
                    disable=not gptq_conf.show_progress,
                )
            ):
                # 1) Identify quantizable submodules within the layer
                full = find_layers(
                    layer,
                    layers=[
                        torch.nn.Linear,
                        torch.nn.Conv2d,
                        torch.nn.Conv1d,
                        torch.nn.ConvTranspose2d,
                    ],
                )
                sequential = [list(full.keys())]
                batch_num = self.num_batches

                # 2) Set up (as in GPTQ)
                for names in sequential:
                    subset = {n: full[n] for n in names}

                    gptq: Dict[str, FPI_GPTQ] = {}
                    for name in subset:
                        gptq[name] = FPI_GPTQ(subset[name])
                        gptq[name].quantizer.configure(
                            bits=8, perchannel=True, sym=False, mse=False
                        )

                    handles = []
                    try:
                        for name in subset:
                            handles.append(
                                subset[name].register_forward_hook(
                                    make_hook(gptq[name])
                                )
                            )

                        # Run layer forward over all cached batches to build
                        # Hessian/statistics
                        for batch_idx in tqdm(
                            range(batch_num),
                            desc=f"[L{l_idx}] collecting",
                            leave=False,
                            unit="batch",
                            disable=not gptq_conf.show_progress,
                        ):
                            run_layer(layer, batch_idx)
                    finally:
                        # Remove handles even if a forward pass failed, so no
                        # hook stays attached to the model.
                        for h in handles:
                            h.remove()

                    # 3) Quantize each submodule
                    for name in subset:
                        if gptq_conf.verbose:
                            print(f"[Layer {l_idx}] {name} -> Quantizing ...")
                        gptq[name].fasterquant(
                            percdamp=0.01,
                            verbose=gptq_conf.verbose,
                        )
                        quantizers[f"model.layers.{l_idx}.{name}"] = gptq[
                            name
                        ].quantizer
                        gptq[name].free()

                # 4) After quantization, re-run the layer to produce outputs
                #    for the next layer
                for batch_idx in tqdm(
                    range(batch_num),
                    desc=f"[L{l_idx}] re-forward",
                    leave=False,
                    unit="batch",
                    disable=not gptq_conf.show_progress,
                ):
                    outs = run_layer(layer, batch_idx)
                    # LLaMA's decoder layer return type differs across
                    # Transformers versions: some return a tuple
                    # (hidden_states, ...), others return just a tensor.
                    # Always take the first element when it's a tuple.
                    outs = outs[0] if isinstance(outs, tuple) else outs
                    # Update inputs for next iteration.
                    self.cache_args[0][batch_idx] = outs

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
        finally:
            # Restore the original cache configuration, also on failure.
            if has_use_cache:
                model.config.use_cache = orig_use_cache

        # Clear caches to free memory
        self.cache_args.clear()
        self.cache_kwargs.clear()
        self.num_batches = 0

        model.quantizers = quantizers

        return model