potnn 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- potnn/__init__.py +86 -0
- potnn/codegen/__init__.py +20 -0
- potnn/codegen/bit2.py +263 -0
- potnn/codegen/fp130.py +269 -0
- potnn/codegen/header.py +460 -0
- potnn/codegen/level5.py +393 -0
- potnn/codegen/scale.py +184 -0
- potnn/codegen/ternary.py +354 -0
- potnn/codegen/unroll.py +616 -0
- potnn/config.py +112 -0
- potnn/export.py +2196 -0
- potnn/fuse.py +167 -0
- potnn/modules/__init__.py +11 -0
- potnn/modules/add.py +114 -0
- potnn/modules/avgpool.py +173 -0
- potnn/modules/base.py +225 -0
- potnn/modules/conv.py +203 -0
- potnn/modules/conv1d.py +317 -0
- potnn/modules/depthwise.py +216 -0
- potnn/modules/linear.py +199 -0
- potnn/quantize/__init__.py +35 -0
- potnn/quantize/calibration.py +233 -0
- potnn/quantize/integer_ops.py +207 -0
- potnn/quantize/integer_sim.py +225 -0
- potnn/quantize/pot.py +455 -0
- potnn/quantize/qat.py +356 -0
- potnn/utils/__init__.py +13 -0
- potnn/utils/allocation.py +240 -0
- potnn/utils/memory.py +158 -0
- potnn/wrapper.py +304 -0
- potnn-1.0.0.dist-info/METADATA +260 -0
- potnn-1.0.0.dist-info/RECORD +35 -0
- potnn-1.0.0.dist-info/WHEEL +5 -0
- potnn-1.0.0.dist-info/licenses/LICENSE +72 -0
- potnn-1.0.0.dist-info/top_level.txt +1 -0
potnn/modules/conv.py
ADDED
@@ -0,0 +1,203 @@
"""PoT-quantized Conv2d layer with Integer Simulation.

v2: Added integer simulation for C-compatible QAT
- Forward pass can simulate C integer operations exactly
- Matches C inference bit-for-bit when use_integer_sim=True
- Eliminates QAT-C accuracy gap
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Union, Tuple

from .base import PoTLayerBase
from ..quantize.pot import quantize_to_pot_ste, quantize_to_pot, quantize_activation_ste, apply_5level_zero_constraint
from ..quantize.integer_ops import (
    round_half_up_ste, clamp_ste,
    fake_quantize_input, fake_quantize_input_uint8,
    fake_requantize
)


class PoTConv2d(PoTLayerBase):
    """Power-of-Two quantized Conv2d layer.

    This layer implements a Conv2d layer with PoT weight quantization.

    [Integer-Only QAT Mode]
    The forward pass simulates C integer arithmetic EXACTLY:
    1. Input Quantization: float -> int8 (or uint8 for first layer)
    2. Integer Conv: int8 * int8 -> int32
    3. Requantize: (int32 * scale_int + round) >> shift
    4. Bias Add: + round(bias_adjusted * act_scale)
    5. Clamp: [0, 127]

    This ensures that training accuracy matches C deployment accuracy.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        stride: Union[int, Tuple[int, int]] = 1,
        padding: Union[int, Tuple[int, int]] = 0,
        dilation: Union[int, Tuple[int, int]] = 1,
        groups: int = 1,
        bias: bool = True,
        encoding: str = 'unroll'
    ):
        super().__init__(encoding)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding if isinstance(padding, tuple) else (padding, padding)
        self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation)
        self.groups = groups

        # Initialize weight parameter
        self.weight = nn.Parameter(torch.empty(
            out_channels, in_channels // groups, *self.kernel_size
        ))

        # Initialize bias parameter
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_channels))
        else:
            self.register_parameter('bias', None)

        # Initialize weights using Kaiming normal
        nn.init.kaiming_normal_(self.weight, mode='fan_out', nonlinearity='relu')

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass with three modes:
        1. Float warmup (quantize=False): Standard conv
        2. Float QAT (use_integer_sim=False): PoT weight + float activation
        3. Integer sim (use_integer_sim=True): C-identical integer ops
        """
        if not self.quantize:
            # Float mode (warmup training)
            return F.conv2d(
                x, self.weight, self.bias,
                self.stride, self.padding, self.dilation, self.groups
            )

        if not getattr(self, 'use_integer_sim', False):
            # Float QAT: PoT weight + float activation
            # ReLU is called externally in the model (torch.relu(conv(x)))
            w_pot = quantize_to_pot_ste(self.weight, self.alpha, encoding=self.encoding)

            # 5level constraint
            if self.encoding == '5level' and self.enforce_5level_constraint:
                w_pot = apply_5level_zero_constraint(w_pot)

            out = F.conv2d(
                x, w_pot * self.alpha, self.bias,
                self.stride, self.padding, self.dilation, self.groups
            )
            return out

        # === Integer Simulation Mode (C-identical) ===

        # === 1. Prepare Integer Parameters ===
        # Always compute dynamically to ensure consistency with export
        scale_int, shift, _ = self._compute_scale_and_shift()

        is_first = self.is_first_layer.item() if self.is_first_layer is not None else False
        is_last = self.is_last_layer.item() if self.is_last_layer is not None else False

        # === 2. Input Quantization ===
        if is_first:
            # First layer: Input is normalized float (x - mean) / std
            # We must simulate C behavior: raw uint8 input
            if self.input_mean is not None and self.input_std is not None:
                # Denormalize: x_raw = x * avg_std + mean
                avg_std = self.input_std.mean().item()
                mean = self.input_mean.view(1, -1, 1, 1).to(x.device)
                x_raw = x * avg_std + mean
                x_raw = clamp_ste(x_raw, 0.0, 1.0)
            else:
                x_raw = x

            # Quantize to uint8 [0, 255]
            # Match C test data generation (img * 255.0)
            x_int = fake_quantize_input_uint8(x_raw, 255.0)
        else:
            # Other layers: Input is already int8 from previous layer
            # No quantization needed
            x_int = x

        # === 3. Weight Quantization ===
        w_pot = quantize_to_pot_ste(self.weight, self.alpha, encoding=self.encoding)

        # 5level constraint (always apply for 5level encoding to match export)
        if self.encoding == '5level':
            w_pot = apply_5level_zero_constraint(w_pot)

        if is_first:
            # DEBUG: L0 weights
            pass

        # === 4. Integer Convolution ===
        # F.conv2d with integer-valued inputs/weights -> integer-valued output (float dtype)
        acc = F.conv2d(
            x_int, w_pot,
            None,  # Bias added separately
            self.stride, self.padding, self.dilation, self.groups
        )

        # === 5. Requantize ===
        # (acc * scale_int + round) >> shift
        acc_scaled = fake_requantize(acc, scale_int, shift)

        # === 6. Bias Addition ===
        if self.bias is not None:
            act_scale = self.act_scale if self.act_scale is not None else torch.tensor(1.0)

            if is_first and self.input_mean is not None and self.input_std is not None:
                # Absorb mean/std into bias (Dynamic for training)
                # bias_adj = bias - (mean/std) * sum(W) * alpha
                avg_std = self.input_std.mean().item()
                alpha = self.alpha

                # Calculate weight sum per channel
                # w_pot shape: [out, in, k, k]
                w_sum = w_pot.sum(dim=(2, 3))  # [out, in]

                # We need to sum over input channels weighted by mean[c]
                # bias_correction = sum_c (mean[c]/avg_std * w_sum[:, c])
                mean_vec = self.input_mean.view(1, -1).to(x.device)  # [1, in]
                bias_correction = (mean_vec / avg_std * w_sum).sum(dim=1)  # [out]

                bias_adjusted = self.bias - bias_correction * alpha
            else:
                bias_adjusted = self.bias

            # Quantize bias: round(bias * act_scale)
            bias_int = round_half_up_ste(bias_adjusted * act_scale)

            # Add bias
            acc_scaled = acc_scaled + bias_int.view(1, -1, 1, 1)

        # === 7. Clamp (ReLU) ===
        if not is_last:
            out = clamp_ste(acc_scaled, 0.0, 127.0)
        else:
            out = acc_scaled

        # === 8. Output ===
        # Round to ensure exact integer (floating point precision)
        # Use STE to maintain gradient flow during training
        # Return the int8-valued result as-is (same as C)
        out = round_half_up_ste(out)

        return out

    def extra_repr(self) -> str:
        s = super().extra_repr()
        s += f', quantize={self.quantize}'
        return s
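Note on the requantize step described above: (acc * scale_int + round) >> shift is the standard fixed-point substitute for multiplying the int32 accumulator by a float scale, with scale_int / 2**shift approximating that scale. The snippet below is a minimal standalone sketch of that arithmetic for reference only; requantize_sketch and the example scale values are illustrative assumptions, not the package's fake_requantize (the STE and exact rounding details live in potnn/quantize/integer_ops.py).

import torch

def requantize_sketch(acc: torch.Tensor, scale_int: int, shift: int) -> torch.Tensor:
    # C equivalent: ((int64_t)acc * scale_int + (1 << (shift - 1))) >> shift
    # acc holds integer values in a float tensor; the shift divides by 2**shift
    # with round-to-nearest behaviour.
    rounding = 1 << (shift - 1) if shift > 0 else 0
    return torch.floor((acc * scale_int + rounding) / float(1 << shift))

# Approximate a float scale of 0.0123 as scale_int / 2**shift:
# round(0.0123 * 2**16) = 806, so scale_int=806, shift=16.
acc = torch.tensor([1000.0, -1000.0, 52345.0])
print(requantize_sketch(acc, 806, 16))  # close to acc * 0.0123 -> [12., -12., 644.]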
potnn/modules/conv1d.py
ADDED
@@ -0,0 +1,317 @@
"""PoT-quantized Conv1d layer with Integer Simulation.

v1: 1D convolution support for time-series and audio processing
- Forward pass can simulate C integer operations exactly
- Matches C inference bit-for-bit when use_integer_sim=True
- Eliminates QAT-C accuracy gap
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Union

from .base import PoTLayerBase
from ..quantize.pot import quantize_to_pot_ste, quantize_to_pot, quantize_activation_ste
from ..quantize.integer_sim import (
    round_ste, floor_ste, clamp_ste,
    quantize_to_int8_ste, quantize_to_uint8_ste,
    requantize_ste
)


class PoTConv1d(PoTLayerBase):
    """Power-of-Two quantized Conv1d layer.

    This layer implements a Conv1d layer with PoT weight quantization
    and alpha scaling. It can be used as a drop-in replacement for
    nn.Conv1d in QAT-aware models.

    Supports two modes:
    - Float QAT (default): Standard fake quantization with float operations
    - Integer Simulation: C-compatible integer operations for exact match
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        encoding: str = 'unroll'
    ):
        """Initialize PoTConv1d layer.

        Args:
            in_channels: Number of input channels
            out_channels: Number of output channels
            kernel_size: Size of the convolution kernel
            stride: Stride of the convolution (default: 1)
            padding: Zero-padding added to both sides (default: 0)
            dilation: Spacing between kernel elements (default: 1)
            groups: Number of blocked connections (default: 1)
            bias: If True, adds a learnable bias (default: True)
            encoding: Encoding type ('unroll', 'fp130', '5level', '2bit', 'ternary')
        """
        super().__init__(encoding)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups

        # Initialize weight parameter: (out_channels, in_channels/groups, kernel_size)
        self.weight = nn.Parameter(torch.empty(
            out_channels, in_channels // groups, self.kernel_size
        ))

        # Initialize bias parameter
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_channels))
        else:
            self.register_parameter('bias', None)

        # Initialize weights using Kaiming normal
        nn.init.kaiming_normal_(self.weight, mode='fan_out', nonlinearity='relu')

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass with optional PoT quantization.

        Args:
            x: Input tensor of shape (N, C_in, L)

        Returns:
            Output tensor of shape (N, C_out, L_out)
        """
        if not self.quantize:
            # Float mode (warmup training)
            return F.conv1d(
                x, self.weight, self.bias,
                self.stride, self.padding, self.dilation, self.groups
            )

        if self.use_integer_sim and self.scale_int is not None:
            if self.training:
                # Training: use float QAT for gradient flow
                # Integer simulation doesn't support alpha gradients
                return self._forward_float_qat(x)
            else:
                # Eval: use integer sim for C-exact match
                return self._forward_integer_sim(x)
        else:
            # Standard Float QAT Mode
            return self._forward_float_qat(x)

    def _forward_float_qat(self, x: torch.Tensor) -> torch.Tensor:
        """Original float QAT forward.

        NOTE: Input is already normalized as (data * 256/255 - mean) / avg_std
        so mean is already subtracted. NO bias absorption needed here.
        Bias absorption is only for Integer Sim (raw uint8 input) and C export.
        """
        DEBUG_QAT = False  # Set to True to print QAT debug output

        # PoT quantization
        w_q = quantize_to_pot_ste(self.weight, self.alpha, encoding=self.encoding)

        if DEBUG_QAT:
            print(f"\n[DEBUG QAT] input: range=[{x.min():.4f}, {x.max():.4f}]")
            print(f" w_q: unique_vals={torch.unique(w_q).tolist()[:10]}...")
            print(f" alpha={self.alpha.item():.4f}")
            print(f" w_effective (w_q*alpha): range=[{(w_q*self.alpha).min():.4f}, {(w_q*self.alpha).max():.4f}]")

        # Convolution with scaled weights (NO bias adjustment - input already normalized)
        out = F.conv1d(
            x, w_q * self.alpha, self.bias,
            self.stride, self.padding, self.dilation, self.groups
        )

        if DEBUG_QAT:
            print(f" conv output: range=[{out.min():.4f}, {out.max():.4f}]")

        # Activation quantization
        if self.act_scale is not None:
            out = quantize_activation_ste(out, self.act_scale)
            if DEBUG_QAT:
                print(f" after act_quant (scale={self.act_scale.item():.4f}): range=[{out.min():.4f}, {out.max():.4f}]")

        return out

    def _forward_integer_sim(self, x: torch.Tensor) -> torch.Tensor:
        """Integer simulation forward - matches C inference exactly.

        C code equivalent:
            // Step 1: PoT convolution
            int32_t acc = 0;
            for (...) {
                acc += input[i] << k;  // or -= for negative weights
            }

            // Step 2: Requantize
            acc = ((int64_t)acc * scale_int + round) >> shift;

            // Step 3: Add bias (with mean absorption for first layer)
            acc += bias_int;

            // Step 4: Clamp (ReLU)
            output = clamp(acc, 0, 127);  // or -128,127 if no ReLU
        """
        DEBUG = False  # Set to True to print detailed debug output

        is_first = self.is_first_layer.item() if self.is_first_layer is not None else False
        is_last = self.is_last_layer.item() if self.is_last_layer is not None else False

        if DEBUG:
            print(f"\n[DEBUG _forward_integer_sim] is_first={is_first}, is_last={is_last}")
            print(f" input: shape={x.shape}, range=[{x.min():.4f}, {x.max():.4f}]")

        # === Step 1: Quantize input to integer ===
        if is_first:
            # First layer: input is NORMALIZED (x - mean) / avg_std
            # C code receives raw uint8 [0,255], so we denormalize first
            if self.input_mean is not None and self.input_std is not None:
                # Denormalize: x_raw = x_norm * avg_std + mean (channel-wise mean!)
                # QAT normalized with channel-wise mean, so denorm with channel-wise mean
                avg_std = self.input_std.mean().item()
                mean = self.input_mean.view(1, -1, 1).to(x.device)  # [1, C, 1]
                x_raw = x * avg_std + mean  # channel-wise mean
                x_raw = torch.clamp(x_raw, 0.0, 1.0)
            else:
                x_raw = x
            # [0,1] → [0,255] (uint8), /256 absorbed in shift (+8)
            x_int = quantize_to_uint8_ste(x_raw, 256.0)
            if DEBUG:
                print(f" x_int (uint8): range=[{x_int.min():.0f}, {x_int.max():.0f}]")
        else:
            # Other layers: convert float back to int8
            # Input was divided by prev_act_scale in previous layer
            prev_scale = self.prev_act_scale if self.prev_act_scale is not None else torch.tensor(1.0)
            x_int = quantize_to_int8_ste(x, prev_scale)
            if DEBUG:
                print(f" x_int (int8): prev_scale={prev_scale.item():.4f}, range=[{x_int.min():.0f}, {x_int.max():.0f}]")

        # === Step 2: PoT Convolution (integer) ===
        # Get PoT weights with STE for gradient flow
        w_pot = quantize_to_pot_ste(self.weight, self.alpha, encoding=self.encoding)

        if DEBUG:
            print(f" w_pot: shape={w_pot.shape}, unique_vals={torch.unique(w_pot).tolist()[:10]}...")
            print(f" alpha={self.alpha.item():.4f}")

        # Integer convolution
        # In C: acc += input << k (shift operation)
        # In Python: float tensor but values are integers
        acc = F.conv1d(
            x_int, w_pot,
            None,  # bias added separately
            self.stride, self.padding, self.dilation, self.groups
        )

        if DEBUG:
            print(f" acc after conv: range=[{acc.min():.0f}, {acc.max():.0f}]")

        # === Step 3: Requantize ===
        # C: ((int64_t)acc * scale_int + round) >> shift
        scale_int = self.scale_int.item() if self.scale_int is not None else 1
        shift = self.shift.item() if self.shift is not None else 0

        if DEBUG:
            print(f" scale_int={scale_int}, shift={shift}")

        acc = requantize_ste(acc, scale_int, shift)

        if DEBUG:
            print(f" acc after requantize: range=[{acc.min():.0f}, {acc.max():.0f}]")

        # === Step 4: Add bias (with mean absorption for first layer) ===
        if self.bias is not None:
            act_scale = self.act_scale if self.act_scale is not None else torch.tensor(1.0)

            if is_first:
                # First layer: absorb mean into bias
                # MUST match export.py absorb_standardization exactly
                # Use avg_std to match QAT and C export

                if self.input_mean is not None and self.input_std is not None:
                    avg_std = self.input_std.mean().item()
                    in_ch = w_pot.shape[1]
                    alpha = self.alpha
                    bias_adjusted = self.bias.clone()

                    for c in range(in_ch):
                        mean_c = self.input_mean[c].item()
                        weight_sum_c = w_pot[:, c, :].sum(dim=1) * alpha  # [out_ch]
                        bias_adjusted = bias_adjusted - (mean_c / avg_std) * weight_sum_c

                    if DEBUG:
                        print(f" [First layer bias absorption - {in_ch} channels, avg_std={avg_std:.4f}]")
                        print(f" input_mean={self.input_mean.tolist()}")
                        print(f" original bias sample: [{self.bias[0].item():.4f}, {self.bias[1].item():.4f}, ...]")
                        print(f" adjusted bias sample: [{bias_adjusted[0].item():.4f}, {bias_adjusted[1].item():.4f}, ...]")
                else:
                    # No standardization - use bias as-is
                    bias_adjusted = self.bias
                    if DEBUG:
                        print(f" [First layer - no standardization]")

                bias_int = round_ste(bias_adjusted * act_scale)

                if DEBUG:
                    print(f" bias_int sample: [{bias_int[0].item():.0f}, {bias_int[1].item():.0f}, ...]")
            else:
                # Other layers: simple bias scaling
                bias_int = round_ste(self.bias * act_scale)
                if DEBUG:
                    print(f" bias_int: act_scale={act_scale.item():.4f}, range=[{bias_int.min():.0f}, {bias_int.max():.0f}]")

            # Add bias (broadcast over length dimension)
            acc = acc + bias_int.view(1, -1, 1)

            if DEBUG:
                print(f" acc after bias: range=[{acc.min():.0f}, {acc.max():.0f}]")

        # === Step 5: Clamp (ReLU) ===
        if not is_last:
            # ReLU: clamp to [0, 127]
            out = clamp_ste(acc, 0.0, 127.0)
            if DEBUG:
                print(f" out after ReLU clamp: range=[{out.min():.0f}, {out.max():.0f}]")
        else:
            # Last layer: no ReLU, output raw logits
            out = acc
            if DEBUG:
                print(f" out (last layer, no clamp): range=[{out.min():.0f}, {out.max():.0f}]")

        # === Step 6: Convert back to float for next layer ===
        # Next layer expects: int_value / act_scale
        if self.act_scale is not None and not is_last:
            out = out / self.act_scale
            if DEBUG:
                print(f" out after /act_scale: range=[{out.min():.4f}, {out.max():.4f}]")

        return out

    def extra_repr(self) -> str:
        """String representation of layer configuration."""
        s = (f'{self.in_channels}, {self.out_channels}, '
             f'kernel_size={self.kernel_size}, stride={self.stride}')
        if self.padding != 0:
            s += f', padding={self.padding}'
        if self.dilation != 1:
            s += f', dilation={self.dilation}'
        if self.groups != 1:
            s += f', groups={self.groups}'
        if self.bias is None:
            s += ', bias=False'
        if self.quantize:
            s += f', quantize=True, encoding={self.encoding}'
        if self.use_integer_sim:
            s += ', integer_sim=True'
        return s
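A note on the first-layer bias absorption used in both files (bias_adj = bias - (mean/std) * sum(W) * alpha): it follows from linearity of convolution, since conv((x - mean)/std, W) = conv(x, W)/std - sum_c (mean[c]/std) * sum(W[:, c]) at every output position when there is no padding. The check below is an illustrative sketch of that identity with made-up shapes and statistics, using plain weights (no PoT quantization or alpha); it is not part of the package API.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.rand(1, 3, 16)                    # raw input in [0, 1]
w = torch.randn(8, 3, 5)                    # conv1d weight [out_ch, in_ch, k]
b = torch.randn(8)
mean = torch.tensor([0.48, 0.45, 0.40])     # per-channel input mean (made up)
std = 0.22                                  # avg_std (made up)

# Path 1: convolve the normalized input, as float QAT does.
x_norm = (x - mean.view(1, -1, 1)) / std
ref = F.conv1d(x_norm, w, b)

# Path 2: convolve the raw input and fold mean/std into the bias,
# as the integer-simulation path and the C export do.
bias_adj = b - (mean.view(1, -1) / std * w.sum(dim=2)).sum(dim=1)
out = F.conv1d(x / std, w, bias_adj)

print(torch.allclose(ref, out, atol=1e-4))  # True: both paths agree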