tico 0.1.0.dev251123__py3-none-any.whl → 0.2.0.dev260122__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tico/__init__.py +2 -2
- tico/_version.py +1 -0
- tico/passes/convert_conv3d_to_conv2d.py +435 -0
- tico/passes/convert_sym_size_to_circle_shape.py +99 -0
- tico/passes/decompose_batch_norm.py +9 -5
- tico/passes/lower_copy.py +95 -0
- tico/passes/ops.py +4 -0
- tico/quantization/algorithm/fpi_gptq/fpi_gptq.py +87 -12
- tico/quantization/algorithm/fpi_gptq/quantizer.py +9 -8
- tico/quantization/algorithm/gptq/gptq.py +211 -12
- tico/quantization/algorithm/gptq/quantizer.py +18 -12
- tico/quantization/config/fpi_gptq.py +3 -0
- tico/quantization/config/gptq.py +27 -4
- tico/quantization/public_interface.py +0 -10
- tico/quantization/wrapq/quantizer.py +2 -0
- tico/serialize/operators/adapters/onert/llama_attention.py +51 -0
- tico/serialize/operators/op_attention.py +58 -0
- tico/serialize/operators/op_circle_shape.py +64 -0
- tico/serialize/operators/op_dequantize_per_channel.py +1 -0
- tico/serialize/operators/op_dequantize_per_tensor.py +1 -0
- tico/serialize/operators/op_transpose_conv.py +66 -50
- tico/utils/convert.py +16 -1
- tico/utils/padding.py +13 -5
- tico/utils/record_input.py +2 -2
- tico/utils/register_custom_op.py +63 -0
- tico/utils/validate_args_kwargs.py +49 -4
- tico-0.2.0.dev260122.dist-info/METADATA +631 -0
- {tico-0.1.0.dev251123.dist-info → tico-0.2.0.dev260122.dist-info}/RECORD +33 -48
- {tico-0.1.0.dev251123.dist-info → tico-0.2.0.dev260122.dist-info}/WHEEL +1 -1
- {tico-0.1.0.dev251123.dist-info → tico-0.2.0.dev260122.dist-info}/entry_points.txt +0 -1
- tico/quantization/algorithm/pt2e/annotation/__init__.py +0 -1
- tico/quantization/algorithm/pt2e/annotation/annotator.py +0 -208
- tico/quantization/algorithm/pt2e/annotation/config.py +0 -26
- tico/quantization/algorithm/pt2e/annotation/op/__init__.py +0 -21
- tico/quantization/algorithm/pt2e/annotation/op/adaptive_avg_pool2d.py +0 -63
- tico/quantization/algorithm/pt2e/annotation/op/add.py +0 -55
- tico/quantization/algorithm/pt2e/annotation/op/conv2d.py +0 -90
- tico/quantization/algorithm/pt2e/annotation/op/div.py +0 -55
- tico/quantization/algorithm/pt2e/annotation/op/linear.py +0 -92
- tico/quantization/algorithm/pt2e/annotation/op/mean.py +0 -51
- tico/quantization/algorithm/pt2e/annotation/op/mul.py +0 -55
- tico/quantization/algorithm/pt2e/annotation/op/relu6.py +0 -51
- tico/quantization/algorithm/pt2e/annotation/op/rsqrt.py +0 -51
- tico/quantization/algorithm/pt2e/annotation/op/sub.py +0 -55
- tico/quantization/algorithm/pt2e/annotation/spec.py +0 -45
- tico/quantization/algorithm/pt2e/annotation/utils.py +0 -88
- tico/quantization/algorithm/pt2e/quantizer.py +0 -81
- tico/quantization/algorithm/pt2e/transformation/__init__.py +0 -1
- tico/quantization/algorithm/pt2e/transformation/convert_scalars_to_attrs.py +0 -58
- tico/quantization/algorithm/pt2e/utils.py +0 -135
- tico/quantization/config/pt2e.py +0 -25
- tico/serialize/operators/op_copy.py +0 -187
- tico-0.1.0.dev251123.dist-info/METADATA +0 -392
- /tico/{quantization/algorithm/pt2e → serialize/operators/adapters/onert}/__init__.py +0 -0
- {tico-0.1.0.dev251123.dist-info → tico-0.2.0.dev260122.dist-info/licenses}/LICENSE +0 -0
- {tico-0.1.0.dev251123.dist-info → tico-0.2.0.dev260122.dist-info}/top_level.txt +0 -0
tico/passes/ops.py
CHANGED
@@ -38,6 +38,10 @@ class AtenOps:
             torch.ops.aten.conv1d.default,
             torch.ops.aten.conv1d.padding,
         ]
+        self.conv3d = [
+            torch.ops.aten.conv3d.default,
+            torch.ops.aten.conv3d.padding,
+        ]
         self.detach = [
             torch.ops.aten.detach_.default,
             torch.ops.aten.detach.default,
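The new `self.conv3d` group mirrors the existing `self.conv1d` list and backs the new `convert_conv3d_to_conv2d` pass. A minimal sketch of how a pass might key off these targets in an exported FX graph; the `is_conv3d_call` helper below is hypothetical and not part of tico:

```python
import torch

# aten overloads registered in AtenOps.conv3d above
CONV3D_TARGETS = {
    torch.ops.aten.conv3d.default,
    torch.ops.aten.conv3d.padding,
}


def is_conv3d_call(node) -> bool:
    """Hypothetical helper: True if a torch.fx.Node calls one of the conv3d overloads."""
    return node.op == "call_function" and node.target in CONV3D_TARGETS
```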
tico/quantization/algorithm/fpi_gptq/fpi_gptq.py
CHANGED

@@ -25,6 +25,12 @@ from typing import Optional
 import torch
 import torch.nn as nn

+from tico.quantization.algorithm.gptq.gptq import (
+    conv2d_weights_to_convtranspose2d_weights,
+    convtranspose2d_weights_to_conv2d_weights,
+    get_matmul_input_for_convtranspose2d,
+)
+
 from tico.quantization.algorithm.gptq.quant import quantize, Quantizer


@@ -56,11 +62,12 @@ class FPI_GPTQ:
         self.layer = layer
         self.dev = self.layer.weight.device
         W = layer.weight.data.clone()
-        if isinstance(self.layer, nn.Conv2d):
+        if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
+            W = W.flatten(1)
+        elif isinstance(self.layer, nn.ConvTranspose2d):
+            W = convtranspose2d_weights_to_conv2d_weights(self.layer, W)
             W = W.flatten(1)

-        if isinstance(self.layer, nn.Conv1d):
-            W = W.t()
         self.rows = W.shape[0]
         self.columns = W.shape[1]
         self.H: Optional[torch.Tensor] = torch.zeros(
@@ -73,7 +80,7 @@
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
         tmp = inp.shape[0]
-        if isinstance(self.layer, nn.Linear)
+        if isinstance(self.layer, nn.Linear):
             if len(inp.shape) > 2:
                 inp = inp.reshape((-1, inp.shape[-1]))
             inp = inp.t()
@@ -85,9 +92,57 @@
                 stride=self.layer.stride,
             )

+            if self.layer.groups != 1:
+                # the idea behind conversion of depthwise convolution to matmul is described here
+                # https://discuss.pytorch.org/t/conv1d-implementation-using-torch-nn-functional-unfold/109643/2
+                # although depthwise convolution is equal to a set of MatMuls
+                # (please note `w.view(1, groups, out_channels // groups, -1)` in the reference above is not just w.flatten(1))
+                # we can approximate groupwise Hessians with their mean
+                # so that we will have just a single Hessian and the usual GPTQ applies
+                inp = inp.reshape(
+                    inp.size(0) * self.layer.groups,
+                    inp.size(1) // self.layer.groups,
+                    inp.shape[2],
+                    inp.shape[3],
+                )  # inp.shape == (batch*groups, in_channels / groups, H, W) to meet group-wise convolution, so that each group is convolved with its own filter
+
+            inp = unfold(
+                inp
+            )  # inp.shape == (batch*groups, k_h*k_w*in_channels / groups, flattened_patches)
+            inp = inp.permute(
+                [1, 0, 2]
+            )  # inp.shape == (k_h*k_w*in_channels / groups, batch * groups, flattened_patches)
+            inp = inp.flatten(
+                1
+            )  # inp.shape == (k_h*k_w*in_channels / groups, batch * groups * flattened_patches)
+            # so inp.matmul(inp.t()).shape == (k_x*k_y*in_channels / groups, k_x*k_y*in_channels / groups) == W.flatten(1)
+
+        if isinstance(self.layer, nn.Conv1d):
+            # nn.Conv1d is basically the same as nn.Conv2d so we can use the same idea as for nn.Conv2d
+            # TODO reduce code duplication
+            # represent conv1d as conv2d(1, k) on reshaped_input(batch, in_channels, 1, L)
+            unfold = nn.Unfold(
+                (1, self.layer.kernel_size[0]),
+                dilation=(1, self.layer.dilation[0]),
+                padding=(0, self.layer.padding[0]),
+                stride=(1, self.layer.stride[0]),
+            )
+            if self.layer.groups != 1:
+                # please see Conv2D for additional info
+                inp = inp.reshape(
+                    inp.size(0) * self.layer.groups,
+                    inp.size(1) // self.layer.groups,
+                    inp.shape[2],
+                )  # inp.shape == (batch*groups, in_channels / groups, L) to meet group-wise convolution, so that each group is convolved with its own filter
+
+            inp = inp.unsqueeze(
+                -2
+            )  # (batch*groups, in_channels / groups, L) -> (batch*groups, in_channels / groups, 1, L), valid for Conv2D
             inp = unfold(inp)
             inp = inp.permute([1, 0, 2])
             inp = inp.flatten(1)
+        if isinstance(self.layer, nn.ConvTranspose2d):
+            inp = get_matmul_input_for_convtranspose2d(self.layer, inp)

         self.H *= self.nsamples / (self.nsamples + tmp)
         self.nsamples += tmp
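Both tricks used in the hunk above (grouped convolution handled through a shared unfold, and Conv1d treated as a Conv2d with a (1, k) kernel) can be checked numerically with plain PyTorch. A small self-contained sketch; the layer sizes are arbitrary and chosen only for illustration:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# 1) grouped Conv2d == a set of per-group matmuls on unfolded patches
B, C_in, C_out, H, W, groups = 2, 6, 6, 8, 8, 3
conv = nn.Conv2d(C_in, C_out, kernel_size=3, padding=1, groups=groups, bias=False)
x = torch.randn(B, C_in, H, W)

unfold = nn.Unfold(conv.kernel_size, dilation=conv.dilation,
                   padding=conv.padding, stride=conv.stride)
cols = unfold(x.reshape(B * groups, C_in // groups, H, W))  # (B*groups, C_in/groups*k_h*k_w, L)
w = conv.weight.flatten(1)                                  # (C_out, C_in/groups*k_h*k_w)
out_pg = C_out // groups
out = torch.empty(B, C_out, H * W)
for b in range(B):
    for g in range(groups):
        out[b, g * out_pg:(g + 1) * out_pg] = (
            w[g * out_pg:(g + 1) * out_pg] @ cols[b * groups + g]
        )
assert torch.allclose(out.view(B, C_out, H, W), conv(x), atol=1e-4)

# 2) Conv1d == Conv2d with a (1, k) kernel applied to a (batch, C, 1, L) input
conv1d = nn.Conv1d(3, 5, kernel_size=4, stride=2, padding=1, bias=False)
xl = torch.randn(2, 3, 16)
out1d = F.conv2d(
    xl.unsqueeze(-2),
    conv1d.weight.unsqueeze(-2),
    stride=(1, conv1d.stride[0]),
    padding=(0, conv1d.padding[0]),
).squeeze(-2)
assert torch.allclose(out1d, conv1d(xl), atol=1e-4)
```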
@@ -100,10 +155,13 @@
         verbose=False,
     ):
         W = self.layer.weight.data.clone()
-        if isinstance(self.layer, nn.Conv2d):
+        if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
             W = W.flatten(1)
-
-        W = W
+        elif isinstance(self.layer, nn.ConvTranspose2d):
+            W = convtranspose2d_weights_to_conv2d_weights(self.layer, W)
+            conv2d_shape = W.shape
+            W = W.flatten(1)  # reshaped to matrix (OUT_channels x the_rest)
+
         W = W.float()
         tick = time.time()
         if not self.quantizer.ready():
@@ -139,7 +197,9 @@
             self.quantizer.maxq,
             W,
             Hinv=Hinv,
-            max_num_of_iters=
+            max_num_of_iters=min(
+                50, self.columns
+            ),  # we don't need to iterate more than self.columns
         )

         if torch.cuda.is_available():
@@ -151,13 +211,22 @@

         Q = Q[:, invperm]

-        if isinstance(self.layer, nn.Conv2d):
+        if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
             Q[:, dead] = quantize(
                 self.layer.weight.flatten(1)[:, dead],
                 self.quantizer.scale,
                 self.quantizer.zero,
                 self.quantizer.maxq,
             )
+        elif isinstance(self.layer, nn.ConvTranspose2d):
+            Q[:, dead] = quantize(
+                convtranspose2d_weights_to_conv2d_weights(
+                    self.layer, self.layer.weight.data
+                ).flatten(1)[:, dead],
+                self.quantizer.scale,
+                self.quantizer.zero,
+                self.quantizer.maxq,
+            )
         else:
             Q[:, dead] = quantize(
                 self.layer.weight[:, dead],
@@ -166,9 +235,15 @@
                 self.quantizer.maxq,
             )

-        self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(
-            self.layer.weight.data.dtype
-        )
+        if isinstance(self.layer, nn.ConvTranspose2d):
+            Q_conv2d = Q.reshape(conv2d_shape).to(self.layer.weight.data.dtype)
+            self.layer.weight.data = conv2d_weights_to_convtranspose2d_weights(
+                self.layer, Q_conv2d
+            )
+        else:
+            self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(
+                self.layer.weight.data.dtype
+            )

     def free(self):
         self.H = None
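The write-back above relies on the two weight-layout helpers being exact inverses of each other. A minimal round-trip sketch, assuming the helpers are importable exactly as in the import hunk above (layer sizes arbitrary):

```python
import torch
import torch.nn as nn

from tico.quantization.algorithm.gptq.gptq import (
    conv2d_weights_to_convtranspose2d_weights,
    convtranspose2d_weights_to_conv2d_weights,
)

layer = nn.ConvTranspose2d(4, 6, kernel_size=3, stride=2, groups=2, bias=False)
w = layer.weight.data.clone()          # ConvTranspose2d layout: (in, out // groups, k_h, k_w)
w_conv2d = convtranspose2d_weights_to_conv2d_weights(layer, w)
assert w_conv2d.shape == (6, 2, 3, 3)  # Conv2d layout: (out, in // groups, k_h, k_w)
w_back = conv2d_weights_to_convtranspose2d_weights(layer, w_conv2d)
assert torch.allclose(w, w_back)       # lossless round trip, so Q can be written back to the layer
```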
tico/quantization/algorithm/fpi_gptq/quantizer.py
CHANGED

@@ -76,14 +76,15 @@ class FPIGPTQQuantizer(GPTQQuantizer):
             )
         ):
             # 1) Identify quantizable submodules within the layer
-            full = find_layers(
-
-
-
-
-
-
-
+            full = find_layers(
+                layer,
+                layers=[
+                    torch.nn.Linear,
+                    torch.nn.Conv2d,
+                    torch.nn.Conv1d,
+                    torch.nn.ConvTranspose2d,
+                ],
+            )
             sequential = [list(full.keys())]

             # 2) Set up (as in GPTQ)
tico/quantization/algorithm/gptq/gptq.py
CHANGED

@@ -31,16 +31,147 @@ torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False


+def convtranspose2d_weights_to_conv2d_weights(layer, w) -> torch.Tensor:
+    if layer.groups == 1:
+        # the last two dimensions of w is (k_h, k_w) to get equivalent Conv2D we need to flip them to get `w_conv2D_equivalent_to_w[i, j] = w_conv[k_h - i - 1, k_w - j - 1]`
+        # the first two dimensions of w is (input_channels, output_channels), so we need to transpose them as Conv2D weights should be in the (output_channels, input_channels) form
+        # please see https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/torch/nn/modules/conv.py#L1059-L1061 for additional info
+        w_conv_transposed = w.transpose(1, 0).flip((-2, -1))
+    else:
+        # basically it's the same as for `layer.groups == 1` but groupwise
+        in_channels, out_channels, kernel_h, kernel_w = layer.weight.shape
+        out_channels *= layer.groups
+        w_conv_transposed = torch.zeros(
+            out_channels, in_channels // layer.groups, kernel_h, kernel_w
+        )
+        for i in range(0, layer.groups):
+            w_conv_transposed[
+                i
+                * out_channels
+                // layer.groups : (i + 1)
+                * out_channels
+                // layer.groups,
+                :,
+                :,
+                :,
+            ] = (
+                w[
+                    i
+                    * in_channels
+                    // layer.groups : (i + 1)
+                    * in_channels
+                    // layer.groups,
+                    :,
+                    :,
+                    :,
+                ]
+                .transpose(1, 0)
+                .flip((-2, -1))
+            )

+    return w_conv_transposed
+
+
+def conv2d_weights_to_convtranspose2d_weights(orig_layer, w) -> torch.Tensor:
+    # this is just an inverse of convtranspose2d_weights_to_conv2d_weights
+    if orig_layer.groups > 1:
+        in_channels, out_channels, _, _ = orig_layer.weight.shape
+        out_channels *= orig_layer.groups
+        w_conv_transposed = torch.zeros_like(orig_layer.weight)
+        for i in range(0, orig_layer.groups):
+            w_conv_transposed[
+                i
+                * in_channels
+                // orig_layer.groups : (i + 1)
+                * in_channels
+                // orig_layer.groups,
+                :,
+                :,
+                :,
+            ] = (
+                w[
+                    i
+                    * out_channels
+                    // orig_layer.groups : (i + 1)
+                    * out_channels
+                    // orig_layer.groups,
+                    :,
+                    :,
+                    :,
+                ]
+                .transpose(1, 0)
+                .flip((-2, -1))
+            )
+    else:
+        w_conv_transposed = w.transpose(1, 0).flip((-2, -1))
+
+    return w_conv_transposed
+
+
+def get_matmul_input_for_convtranspose2d(layer, inp):
+    # Please see https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/torch/nn/modules/conv.py#L996-L998 for padding
+    strided_pad = (
+        layer.dilation[0] * (layer.kernel_size[0] - 1) - layer.padding[0],
+        layer.dilation[1] * (layer.kernel_size[1] - 1) - layer.padding[1],
+    )
+
+    # interleave input with zero rows and columns according to stride
+    # Please see https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/torch/nn/modules/conv.py#L991-L994 for more info
+    inp_strided = torch.zeros(
+        inp.shape[0],
+        inp.shape[1],
+        layer.stride[0] * (inp.shape[2] - 1) + 2 * strided_pad[0] + 1,
+        layer.stride[1] * (inp.shape[3] - 1) + 2 * strided_pad[1] + 1,
+        device=inp.device,
+    )
+
+    indices = torch.arange(0, inp.shape[2], device=inp.device)
+    # insert original input values according to stride to meet https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/torch/nn/modules/conv.py#L991-L994
+    inp_strided[
+        :,
+        :,
+        layer.stride[0] * indices + strided_pad[0],
+        strided_pad[1] : -strided_pad[1] : layer.stride[1],
+    ] = inp[:, :, indices, :]
+    del inp
+    inp = (
+        inp_strided  # so the rest is just processing for Conv2D with transposed weights
+    )
+
+    # TODO reduce code duplication with Conv2D
+    unfold = nn.Unfold(
+        layer.kernel_size,
+        dilation=layer.dilation,
+        padding=(
+            0,
+            0,
+        ),  # equivalent Conv2D has (0, 0) padding for input_strided as input
+        stride=(1, 1),  # equivalent Conv2D has (1, 1) stride for input_strided as input
+    )
+
+    if layer.groups != 1:
+        inp = inp.reshape(
+            inp.size(0) * layer.groups,
+            inp.size(1) // layer.groups,
+            inp.shape[2],
+            inp.shape[3],
+        )  # inp.shape == (batch*groups, in_channels / groups, H, W) to meet group-wise convolution, so that each group is convolved with its own filter
+
+    inp = unfold(inp).permute([1, 0, 2]).flatten(1)
+    return inp
+
+
 class GPTQ:
     def __init__(self, layer):
         self.layer = layer
         self.dev = self.layer.weight.device
         W = layer.weight.data.clone()
-        if isinstance(self.layer, nn.Conv2d):
+        if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
+            W = W.flatten(1)  # reshaped to matrix (OUT_channels x the_rest)
+        elif isinstance(self.layer, nn.ConvTranspose2d):
+            W = convtranspose2d_weights_to_conv2d_weights(self.layer, W)
             W = W.flatten(1)

-        if isinstance(self.layer, nn.Conv1d):
-            W = W.t()
         self.rows = W.shape[0]
         self.columns = W.shape[1]
         self.H: Optional[torch.Tensor] = torch.zeros(
@@ -53,7 +184,7 @@
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
         tmp = inp.shape[0]
-        if isinstance(self.layer, nn.Linear)
+        if isinstance(self.layer, nn.Linear):
             if len(inp.shape) > 2:
                 inp = inp.reshape((-1, inp.shape[-1]))
             inp = inp.t()
@@ -65,10 +196,59 @@
                 stride=self.layer.stride,
             )

+            if self.layer.groups != 1:
+                # the idea behind conversion of depthwise convolution to matmul is described here
+                # https://discuss.pytorch.org/t/conv1d-implementation-using-torch-nn-functional-unfold/109643/2
+                # although depthwise convolution is equal to a set of MatMuls
+                # (please note `w.view(1, groups, out_channels // groups, -1)` in the reference above is not just w.flatten(1))
+                # we can approximate groupwise Hessians with their mean
+                # so that we will have just a single Hessian and the usual GPTQ applies
+                inp = inp.reshape(
+                    inp.size(0) * self.layer.groups,
+                    inp.size(1) // self.layer.groups,
+                    inp.shape[2],
+                    inp.shape[3],
+                )  # inp.shape == (batch*groups, in_channels / groups, H, W) to meet group-wise convolution, so that each group is convolved with its own filter
+
+            inp = unfold(
+                inp
+            )  # inp.shape == (batch*groups, k_h*k_w*in_channels / groups, flattened_patches)
+            inp = inp.permute(
+                [1, 0, 2]
+            )  # inp.shape == (k_h*k_w*in_channels / groups, batch * groups, flattened_patches)
+            inp = inp.flatten(
+                1
+            )  # inp.shape == (k_h*k_w*in_channels / groups, batch * groups * flattened_patches)
+            # so inp.matmul(inp.t()).shape == (k_x*k_y*in_channels / groups, k_x*k_y*in_channels / groups) == W.flatten(1)
+
+        if isinstance(self.layer, nn.Conv1d):
+            # nn.Conv1d is basically the same as nn.Conv2d so we can use the same idea as for nn.Conv2d
+            # TODO reduce code duplication
+            # represent conv1d as conv2d(1, k) on reshaped_input(batch, in_channels, 1, L)
+            unfold = nn.Unfold(
+                (1, self.layer.kernel_size[0]),
+                dilation=(1, self.layer.dilation[0]),
+                padding=(0, self.layer.padding[0]),
+                stride=(1, self.layer.stride[0]),
+            )
+            if self.layer.groups != 1:
+                # please see Conv2D for additional info
+                inp = inp.reshape(
+                    inp.size(0) * self.layer.groups,
+                    inp.size(1) // self.layer.groups,
+                    inp.shape[2],
+                )  # inp.shape == (batch*groups, in_channels / groups, L) to meet group-wise convolution, so that each group is convolved with its own filter
+
+            inp = inp.unsqueeze(
+                -2
+            )  # (batch*groups, in_channels / groups, L) -> (batch*groups, in_channels / groups, 1, L), valid for Conv2D
             inp = unfold(inp)
             inp = inp.permute([1, 0, 2])
             inp = inp.flatten(1)

+        if isinstance(self.layer, nn.ConvTranspose2d):
+            inp = get_matmul_input_for_convtranspose2d(self.layer, inp)
+
         self.H *= self.nsamples / (self.nsamples + tmp)
         self.nsamples += tmp
         inp = math.sqrt(2 / self.nsamples) * inp.float()
@@ -84,10 +264,13 @@
         verbose=False,
     ):
         W = self.layer.weight.data.clone()
-        if isinstance(self.layer, nn.Conv2d):
-            W = W.flatten(1)
-
-        W = W
+        if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
+            W = W.flatten(1)  # reshaped to matrix (OUT_channels x the_rest)
+        elif isinstance(self.layer, nn.ConvTranspose2d):
+            W = convtranspose2d_weights_to_conv2d_weights(self.layer, W)
+            conv2d_shape = W.shape
+            W = W.flatten(1)  # reshaped to matrix (OUT_channels x the_rest)
+
         W = W.float()
         tick = time.time()
         if not self.quantizer.ready():
@@ -181,7 +364,7 @@
         if actorder:
             Q = Q[:, invperm]

-        if isinstance(self.layer, nn.Conv2d):
+        if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d):
             if groupsize == -1:  # TODO support groupsize != -1
                 Q[:, dead] = quantize(
                     self.layer.weight.flatten(1)[:, dead],
@@ -189,6 +372,16 @@
                     self.quantizer.zero,
                     self.quantizer.maxq,
                 )
+        elif isinstance(self.layer, nn.ConvTranspose2d):
+            if groupsize == -1:  # TODO support groupsize != -1
+                Q[:, dead] = quantize(
+                    convtranspose2d_weights_to_conv2d_weights(
+                        self.layer, self.layer.weight.data
+                    ).flatten(1)[:, dead],
+                    self.quantizer.scale,
+                    self.quantizer.zero,
+                    self.quantizer.maxq,
+                )
         else:
             if groupsize == -1:  # TODO support groupsize != -1
                 Q[:, dead] = quantize(
@@ -202,9 +395,15 @@
             groupsize == -1 or torch.sum(dead) == 0
         )  # TODO `dead` elements should be RTN quantized for groupwise

-        self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(
-            self.layer.weight.data.dtype
-        )
+        if isinstance(self.layer, nn.ConvTranspose2d):
+            Q_conv2d = Q.reshape(conv2d_shape).to(self.layer.weight.data.dtype)
+            self.layer.weight.data = conv2d_weights_to_convtranspose2d_weights(
+                self.layer, Q_conv2d
+            )
+        else:
+            self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(
+                self.layer.weight.data.dtype
+            )

     def free(self):
         self.H = None
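The three new helpers rest on a standard identity: a transposed convolution equals an ordinary convolution with channel-transposed, spatially flipped weights applied to a zero-interleaved, re-padded input. A small sketch that checks both the stride-1 and the strided case with plain PyTorch (shapes arbitrary, dilation fixed at 1):

```python
import torch
import torch.nn.functional as F

# stride 1: ConvTranspose2d == Conv2d with transposed, flipped weights on an
# input padded by (k - 1) on every side
x = torch.randn(1, 3, 5, 5)
w = torch.randn(3, 4, 3, 3)                # ConvTranspose2d layout: (in, out, k_h, k_w)
w_conv = w.transpose(0, 1).flip((-2, -1))  # Conv2d layout: (out, in, k_h, k_w)
ref = F.conv_transpose2d(x, w)
out = F.conv2d(F.pad(x, (2, 2, 2, 2)), w_conv)
assert torch.allclose(ref, out, atol=1e-4)

# stride > 1: zero-interleave the input by the stride first, which is exactly
# what get_matmul_input_for_convtranspose2d prepares before unfolding
stride, pad, k = 2, 1, 3
x = torch.randn(1, 2, 4, 4)
w = torch.randn(2, 3, k, k)
ref = F.conv_transpose2d(x, w, stride=stride, padding=pad)

sp = k - 1 - pad                           # dilation == 1 here
xs = torch.zeros(1, 2,
                 stride * (x.shape[2] - 1) + 2 * sp + 1,
                 stride * (x.shape[3] - 1) + 2 * sp + 1)
xs[:, :, sp:sp + stride * (x.shape[2] - 1) + 1:stride,
         sp:sp + stride * (x.shape[3] - 1) + 1:stride] = x
out = F.conv2d(xs, w.transpose(0, 1).flip((-2, -1)))
assert torch.allclose(ref, out, atol=1e-4)
```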
tico/quantization/algorithm/gptq/quantizer.py
CHANGED

@@ -170,6 +170,7 @@ class GPTQQuantizer(BaseQuantizer):

         gptq_conf = self.config
         assert isinstance(gptq_conf, GPTQConfig)
+        gptq_conf.validate()
         # Disable use_cache during calibration
         if hasattr(model, "config") and hasattr(model.config, "use_cache"):
             orig_use_cache = model.config.use_cache
@@ -193,13 +194,15 @@
             )
         ):
             # 1) Identify quantizable submodules within the layer
-            full = find_layers(
-
-
-
-
-
-
+            full = find_layers(
+                layer,
+                layers=[
+                    torch.nn.Linear,
+                    torch.nn.Conv2d,
+                    torch.nn.Conv1d,
+                    torch.nn.ConvTranspose2d,
+                ],
+            )
             sequential = [list(full.keys())]

             # 2) Set up GPTQ objects and gather stats
@@ -210,7 +213,10 @@
             for name in subset:
                 gptq[name] = GPTQ(subset[name])
                 gptq[name].quantizer.configure(
-                    bits=
+                    bits=gptq_conf.weight_bits,
+                    perchannel=gptq_conf.perchannel,
+                    sym=gptq_conf.symmetric,
+                    mse=gptq_conf.mse,
                 )

             # Hook to collect (inp, out) for GPTQ
@@ -250,10 +256,10 @@
                 if gptq_conf.verbose:
                     print(f"[Layer {l_idx}] {name} -> Quantizing ...")
                 gptq[name].fasterquant(
-                    percdamp=
-                    groupsize
-                    actorder=
-                    static_groups=
+                    percdamp=gptq_conf.percdamp,
+                    groupsize=gptq_conf.groupsize,
+                    actorder=gptq_conf.actorder,
+                    static_groups=gptq_conf.static_groups,
                     verbose=gptq_conf.verbose,
                 )
                 quantizers[f"model.layers.{l_idx}.{name}"] = gptq[name].quantizer
tico/quantization/config/fpi_gptq.py
CHANGED

@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from dataclasses import dataclass
+
 from tico.quantization.config.gptq import GPTQConfig


+@dataclass
 class FPIGPTQConfig(GPTQConfig):
     """
     Configuration for FPIGPTQ (Fixed Point Iteration).
tico/quantization/config/gptq.py
CHANGED

@@ -12,18 +12,41 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from dataclasses import dataclass
+
 from tico.quantization.config.base import BaseConfig


+@dataclass
 class GPTQConfig(BaseConfig):
     """
-    Configuration for GPTQ.
+    Configuration for GPTQ weight quantization.
     """

-
-
-
+    # general
+    verbose: bool = False
+    show_progress: bool = True
+
+    # quantizer.configure params (weight quantization spec)
+    weight_bits: int = 8
+    perchannel: bool = True
+    symmetric: bool = False
+    mse: bool = False
+
+    # GPTQ.fasterquant params (algorithm hyperparams)
+    percdamp: float = 0.01
+    groupsize: int = -1
+    actorder: bool = True
+    static_groups: bool = False

     @property
     def name(self) -> str:
         return "gptq"
+
+    def validate(self) -> None:
+        if self.weight_bits <= 0:
+            raise ValueError(f"weight_bits must be positive. got {self.weight_bits}")
+        if self.groupsize != -1 and self.groupsize <= 0:
+            raise ValueError(f"groupsize must be -1 or positive. got {self.groupsize}")
+        if not (0.0 < self.percdamp <= 1.0):
+            raise ValueError(f"percdamp must be in (0, 1]. got {self.percdamp}")
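With GPTQConfig now a dataclass, the GPTQ hyperparameters can be set at construction time and sanity-checked before calibration; a minimal usage sketch:

```python
from tico.quantization.config.gptq import GPTQConfig

cfg = GPTQConfig(weight_bits=4, symmetric=True, groupsize=-1, percdamp=0.01)
cfg.validate()   # raises ValueError for, e.g., weight_bits=0 or percdamp=0.0
print(cfg.name)  # "gptq"
```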

tico/quantization/public_interface.py
CHANGED

@@ -18,7 +18,6 @@ from typing import Any, Dict, Optional
 import torch

 from tico.quantization.algorithm.gptq.quantizer import GPTQQuantizer
-from tico.quantization.algorithm.pt2e.quantizer import PT2EQuantizer
 from tico.quantization.config.base import BaseConfig
 from tico.quantization.quantizer import BaseQuantizer
 from tico.quantization.quantizer_registry import get_quantizer
@@ -55,11 +54,6 @@
         raise RuntimeError("prepare() already has been called.")
     quantizer = get_quantizer(quant_config)

-    if isinstance(quantizer, PT2EQuantizer) and inplace:
-        raise RuntimeError(
-            "In-place is not supported for PT2E quantization due to limitation in the underlying Torch APIs. Please set 'inplace=False' to proceed."
-        )
-
     model = model if inplace else copy.deepcopy(model)

     model = quantizer.prepare(model, args, kwargs)
@@ -90,10 +84,6 @@ def convert(model, inplace: Optional[bool] = True):
     else:
         raise RuntimeError("Call prepare() function first.")

-    if isinstance(quantizer, PT2EQuantizer) and inplace:
-        raise RuntimeError(
-            "In-place is not supported for PT2E quantization due to limitation in the underlying Torch APIs. Please set 'inplace=False' to proceed."
-        )
     # deepcopy prevents the quantizer from restoring the catcher used for calibration.
     # TODO Revisit `inplace` policy.
     if isinstance(quantizer, GPTQQuantizer) and not inplace:

tico/quantization/wrapq/quantizer.py
CHANGED

@@ -115,6 +115,7 @@ class PTQQuantizer(BaseQuantizer):
                 assert not self.strict_wrap
                 wrapped = self._wrap_supported(wrapped, child_cfg)
                 root[i] = wrapped  # type: ignore[index]
+            return root

         if isinstance(root, nn.ModuleDict):
             for k, child in list(root.items()):
@@ -128,6 +129,7 @@
                 assert not self.strict_wrap
                 wrapped = self._wrap_supported(wrapped, child_cfg)
                 root[k] = wrapped  # type: ignore[index]
+            return root

         # Case C: Leaf node
         root_name = getattr(root, "_get_name", lambda: None)()