potnn-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
potnn/__init__.py ADDED
@@ -0,0 +1,86 @@
+ """potnn: Power-of-Two Neural Network Compiler for Ultra-Low-Cost MCUs
+
+ A PyTorch-based library for training and deploying neural networks
+ on MCUs without multiplication instructions, using only shifts and adds.
+ """
+
+ __version__ = "0.4.8"
+
+ # Import core modules
+ from .modules.conv import PoTConv2d
+ from .modules.conv1d import PoTConv1d
+ from .modules.depthwise import PoTDepthwiseConv2d
+ from .modules.linear import PoTLinear
+ from .modules.add import PoTAdd
+ from .modules.avgpool import PoTGlobalAvgPool
+ from .config import Config
+ from .export import export
+ from .quantize.calibration import calibrate_model
+ from .quantize.qat import prepare_qat, enable_integer_sim, disable_integer_sim
+ from .wrapper import train
+ from .fuse import fuse_batchnorm, check_bn_fused
+
+
+ def calibrate(model, data_loader, config=None, num_batches=10, mean=None, std=None):
+     """Calibrate model activation scales.
+
+     Two calling conventions are supported:
+
+     1. With config (recommended):
+        calibrate(model, loader, config, num_batches=10)
+
+     2. Direct parameters:
+        calibrate(model, loader, num_batches=10, mean=[0.1307], std=[0.3081])
+
+     Args:
+         model: Model with PoT layers
+         data_loader: Calibration data loader
+         config: potnn.Config object (optional; mean/std are extracted from it)
+         num_batches: Number of batches to use for calibration (default: 10)
+         mean: Dataset mean (list or float), used if config is not provided
+         std: Dataset std (list or float), used if config is not provided
+
+     Returns:
+         Dictionary of activation max values per layer
+     """
+     # Handle config object
+     if config is not None and isinstance(config, Config):
+         mean = config.mean if config.mean is not None else [0.0]
+         std = config.std if config.std is not None else [1.0]
+
+     # Handle num_batches passed positionally in place of config (common mistake)
+     if isinstance(config, int):
+         num_batches = config
+         config = None
+
+     # Default values
+     if mean is None:
+         mean = 0.0
+     if std is None:
+         std = 1.0
+
+     return calibrate_model(model, data_loader, num_batches=num_batches, mean=mean, std=std)
+
+
+ __all__ = [
+     'PoTConv2d',
+     'PoTConv1d',
+     'PoTDepthwiseConv2d',
+     'PoTLinear',
+     'PoTAdd',
+     'PoTGlobalAvgPool',
+     'Config',
+     'export',
+     'calibrate_model',
+     'calibrate',
+     'prepare_qat',
+     'enable_integer_sim',
+     'disable_integer_sim',
+     'train',
+     'fuse_batchnorm',
+     'check_bn_fused',
+ ]
+
+ # Package metadata
+ __author__ = "potnn developers"
+ __license__ = "MIT"
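The two calling conventions documented for `calibrate` look like this in practice. A minimal sketch, assuming `model`, `loader`, and `config` already exist; the mean/std values are the usual MNIST statistics, shown purely as an example:

```python
import potnn

# Convention 1: pass a potnn.Config carrying dataset statistics.
# If config.mean/config.std are unset, the function falls back to [0.0]/[1.0].
act_ranges = potnn.calibrate(model, loader, config, num_batches=10)

# Convention 2: pass mean/std directly.
act_ranges = potnn.calibrate(model, loader, num_batches=10,
                             mean=[0.1307], std=[0.3081])

# calibrate(model, loader, 5) also works: an int in the config slot is
# treated as num_batches, matching the fallback in the function body.
```

Both forms return the dictionary of per-layer activation maxima produced by `calibrate_model`.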
potnn/codegen/__init__.py ADDED
@@ -0,0 +1,20 @@
+ """C code generation module for potnn."""
+
+ from .header import generate_c_header
+ from .unroll import generate_unrolled_layer
+ from .scale import decompose_scale_to_shifts, generate_scale_func
+ from .fp130 import generate_fp130_layer
+ from .bit2 import generate_2bit_layer
+ from .level5 import generate_5level_layer
+ from .ternary import generate_ternary_layer
+
+ __all__ = [
+     'generate_c_header',
+     'generate_unrolled_layer',
+     'decompose_scale_to_shifts',
+     'generate_scale_func',
+     'generate_fp130_layer',
+     'generate_2bit_layer',
+     'generate_5level_layer',
+     'generate_ternary_layer',
+ ]
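A sketch of how these generators are driven; the `layer_info` schema is inferred from `bit2.py` and `fp130.py` below, and the layer shape and weights here are hypothetical:

```python
import numpy as np
from potnn.codegen import generate_2bit_layer

# Hypothetical 3x3 conv whose weights are already quantized to the
# 2-bit levels {±1, ±2}; real weights would come out of QAT.
rng = np.random.default_rng(0)
w_q = rng.choice(np.array([-2, -1, 1, 2]), size=(8, 4, 3, 3))

layer_info = {
    'name': 'conv1',
    'type': 'PoTConv2d',   # dispatch keys on 'Linear'/'Conv2d' in the type string
    'weight': w_q,
    'bias': None,          # None skips the bias table entirely
    'has_relu': True,
    'act_scale': 64.0,
    'in_h': 16, 'in_w': 16, 'out_h': 16, 'out_w': 16,
    'stride': 1, 'padding': 1, 'groups': 1,
}

c_source = generate_2bit_layer(layer_info)  # emits conv1_weights[] and conv1_forward()
```

`generate_fp130_layer` accepts the same dictionary. Note that the emitted `conv1_forward` calls a `scale_conv1()` helper, which the `scale` module's `generate_scale_func` appears responsible for producing.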
potnn/codegen/bit2.py ADDED
@@ -0,0 +1,263 @@
+ """Generate 2-bit encoded C code for PoT layers.
+
+ 2-bit encoding: [sign(1)][shift(1)] = 2 bits
+ - 4 levels: ±1, ±2
+ - No zero (DenseShift style)
+ - Decoding: val = (shift ? 2 : 1) * (sign ? -1 : 1)
+ - Minimal memory footprint
+ """
+
+ import numpy as np
+ from typing import Dict, Any, Tuple
+
+
+ def pack_weights_2bit(w_q: np.ndarray) -> Tuple[np.ndarray, int]:
+     """Pack quantized weights to 2-bit format.
+
+     Args:
+         w_q: Quantized weights (values in ±1, ±2)
+
+     Returns:
+         packed: uint32 array (16 weights per word)
+         original_size: number of weights
+     """
+     # w_q shape: [out_ch, in_ch, kh, kw], [out_ch, in_ch, kw], or [out_features, in_features]
+     if len(w_q.shape) == 4:
+         out_ch, in_ch, kh, kw = w_q.shape
+         kernel_size = in_ch * kh * kw
+     elif len(w_q.shape) == 3:
+         # Conv1d
+         out_ch, in_ch, kw = w_q.shape
+         kernel_size = in_ch * kw
+     else:
+         out_ch, in_features = w_q.shape
+         kernel_size = in_features
+
+     # Words per filter (16 weights per uint32)
+     words_per_filter = (kernel_size + 15) // 16
+     packed = np.zeros(out_ch * words_per_filter, dtype=np.uint32)
+
+     for oc in range(out_ch):
+         # Get filter weights and flatten
+         filter_w = w_q[oc].flatten()
+         n = len(filter_w)
+
+         # Encode; zeros have no code point, so map them to alternating +1/-1
+         encoded = np.zeros(n, dtype=np.uint8)
+         for i, w in enumerate(filter_w):
+             if w == 0:
+                 w = 1 if i % 2 == 0 else -1
+             sign = 1 if w < 0 else 0
+             shift = 1 if abs(w) == 2 else 0
+             encoded[i] = (sign << 1) | shift
+
+         # Pack to uint32, LSB first
+         for i in range(0, n, 16):
+             chunk = 0
+             for j in range(16):
+                 if i + j < n:
+                     chunk |= (int(encoded[i + j]) << (2 * j))
+             packed[oc * words_per_filter + (i // 16)] = chunk
+
+     return packed, out_ch * kernel_size
+
+
+ def generate_2bit_layer(layer_info: Dict[str, Any]) -> str:
+     """Generate 2-bit encoded C code for a layer.
+
+     Args:
+         layer_info: Dictionary with layer information
+
+     Returns:
+         C code for the layer
+     """
+     name = layer_info['name']
+     layer_type = layer_info['type']
+     weight = layer_info['weight']
+     bias = layer_info.get('bias', None)
+     use_relu = layer_info.get('has_relu', False)
+     act_scale = layer_info.get('act_scale', 1.0) or 1.0
+
+     # Get weight as numpy
+     if hasattr(weight, 'numpy'):
+         w_q = weight.numpy()
+     else:
+         w_q = np.array(weight)
+
+     # Pack weights
+     packed, n_weights = pack_weights_2bit(w_q)
+
+     code = f"// {name} - 2-bit encoding (4 levels: ±1, ±2)\n"
+     code += f"// Packed weights: {len(packed)*4} bytes ({n_weights} weights, packed as uint32)\n\n"
+
+     # Weight data
+     code += f"static const uint32_t {name}_weights[] = {{\n    "
+     for i, w in enumerate(packed):
+         code += f"0x{w:08x}, "
+         if (i + 1) % 8 == 0:
+             code += "\n    "
+     code += "\n};\n\n"
+
+     # Bias data (scaled by act_scale)
+     if bias is not None:
+         code += f"static const int32_t {name}_bias[] = {{\n    "
+         for i, b in enumerate(bias):
+             bias_val = int(round(b.item() * act_scale))
+             # int32 is wide enough here; no clipping needed
+             code += f"{bias_val}, "
+             if (i + 1) % 16 == 0:
+                 code += "\n    "
+         code += "\n};\n\n"
+
+     if 'Linear' in layer_type:
+         code += _generate_linear_2bit(name, w_q.shape, bias, use_relu, act_scale)
+     elif 'Conv2d' in layer_type:
+         code += _generate_conv2d_2bit(name, layer_info, bias, use_relu, act_scale)
+
+     return code
+
+
+ def _generate_linear_2bit(name: str, shape: tuple, bias, use_relu: bool, act_scale: float) -> str:
+     """Generate 2-bit Linear layer code."""
+     out_features, in_features = shape
+
+     code = f"static void {name}_forward(const int8_t* input, int8_t* output) {{\n"
+     code += f"    const uint32_t* wp = {name}_weights;\n"
+     code += f"    int32_t acc, shifted;\n"
+     code += f"    uint32_t weight_chunk;\n"
+     code += f"    uint8_t code, shift;\n\n"
+
+     code += f"    for (int o = 0; o < {out_features}; o++) {{\n"
+     code += f"        acc = 0;\n"
+     code += f"        for (int i = 0; i < {in_features}; i += 16) {{\n"
+     code += f"            weight_chunk = *wp++;\n"
+     code += f"\n"
+     code += f"            // Process 16 weights from chunk (LSB first)\n"
+     code += f"            for (int k = 0; k < 16 && (i + k) < {in_features}; k++) {{\n"
+     code += f"                // 2 bits: [sign(1)][shift(1)]\n"
+     code += f"                code = (weight_chunk >> (k << 1)) & 0x3;\n"
+     code += f"                shift = code & 1;  // 0 or 1\n"
+     code += f"                shifted = (int32_t)input[i + k] << shift;\n"
+     code += f"                acc += (code & 2) ? -shifted : shifted;\n"
+     code += f"            }}\n"
+     code += f"        }}\n"
+
+     # Scale
+     code += f"        acc = scale_{name}(acc);\n"
+
+     # Bias
+     if bias is not None:
+         code += f"        acc += {name}_bias[o];\n"
+
+     # ReLU
+     if use_relu:
+         code += f"        if (acc < 0) acc = 0;\n"
+
+     # Clamp and store
+     code += f"        output[o] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));\n"
+     code += f"    }}\n"
+     code += f"}}\n\n"
+
+     return code
+
+
+ def _generate_conv2d_2bit(name: str, layer_info: Dict, bias, use_relu: bool, act_scale: float) -> str:
+     """Generate 2-bit Conv2d layer code."""
+     weight = layer_info['weight']
+     if hasattr(weight, 'shape'):
+         w_shape = weight.shape
+     else:
+         w_shape = np.array(weight).shape
+
+     if len(w_shape) == 4:
+         out_ch, in_ch, kh, kw = w_shape
+     elif len(w_shape) == 3:
+         out_ch, in_ch, kw = w_shape
+         kh = 1
+
+     out_h = layer_info.get('out_h', 1)
+     out_w = layer_info.get('out_w', 1)
+     in_h = layer_info.get('in_h', 1)
+     in_w = layer_info.get('in_w', 1)
+     stride = layer_info.get('stride', 1)
+     padding = layer_info.get('padding', 0)
+     groups = layer_info.get('groups', 1)
+
+     # Handle tuple parameters
+     if isinstance(stride, tuple): stride_h, stride_w = stride
+     else: stride_h = stride_w = stride
+
+     if isinstance(padding, tuple): pad_h, pad_w = padding
+     else: pad_h = pad_w = padding
+
+     kernel_size = in_ch * kh * kw
+     words_per_filter = (kernel_size + 15) // 16
+
+     code = f"static void {name}_forward(const int8_t* input, int8_t* output) {{\n"
+     code += f"    // Conv2d: {in_ch}x{in_h}x{in_w} -> {out_ch}x{out_h}x{out_w}\n"
+     code += f"    const uint32_t* wp_base;\n"
+     code += f"    int32_t acc, shifted;\n"
+     code += f"    uint32_t weight_chunk;\n"
+     code += f"    uint8_t code_bits, shift;\n\n"
+
+     code += f"    for (int oc = 0; oc < {out_ch}; oc++) {{\n"
+     code += f"        wp_base = {name}_weights + oc * {words_per_filter};\n"
+     code += f"        for (int oy = 0; oy < {out_h}; oy++) {{\n"
+     code += f"            for (int ox = 0; ox < {out_w}; ox++) {{\n"
+     code += f"                acc = 0;\n"
+     code += f"                const uint32_t* wp = wp_base;\n"
+     code += f"                weight_chunk = *wp++;\n"
+     code += f"                int w_idx = 0;\n"
+
+     # Group offset calculation (the weight's in_ch is already channels-per-group)
+     channels_per_group = in_ch
+     if groups == out_ch:
+         group_stride_str = f"oc * {channels_per_group}"
+     elif groups > 1:
+         out_per_group = out_ch // groups
+         group_stride_str = f"(oc / {out_per_group}) * {channels_per_group}"
+     else:
+         group_stride_str = "0"
+
+     code += f"                for (int ic = 0; ic < {in_ch}; ic++) {{\n"
+     code += f"                    for (int ky = 0; ky < {kh}; ky++) {{\n"
+     code += f"                        int iy = oy * {stride_h} + ky - {pad_h};\n"
+     code += f"                        int skip = (iy < 0 || iy >= {in_h});\n"
+     code += f"                        for (int kx = 0; kx < {kw}; kx++) {{\n"
+     code += f"                            int ix = ox * {stride_w} + kx - {pad_w};\n"
+     code += f"\n"
+     code += f"                            // Extract 2 bits; refill the chunk every 16 weights\n"
+     code += f"                            if ((w_idx & 15) == 0 && w_idx > 0) weight_chunk = *wp++;\n"
+     code += f"                            code_bits = (weight_chunk >> ((w_idx & 15) << 1)) & 3;\n"
+     code += f"                            w_idx++;\n"
+     code += f"\n"
+     code += f"                            if (skip || ix < 0 || ix >= {in_w}) continue;\n"
+     code += f"\n"
+     code += f"                            // Decode: bit0 = shift (x1 or x2), bit1 = sign\n"
+     code += f"                            shift = code_bits & 1;\n"
+
+     if group_stride_str == "0":
+         input_idx = f"ic * {in_h * in_w} + iy * {in_w} + ix"
+     else:
+         input_idx = f"({group_stride_str} + ic) * {in_h * in_w} + iy * {in_w} + ix"
+
+     code += f"                            shifted = (int32_t)input[{input_idx}] << shift;\n"
+     code += f"                            acc += (code_bits & 2) ? -shifted : shifted;\n"
+     code += f"                        }}\n"
+     code += f"                    }}\n"
+     code += f"                }}\n"
+     code += f"                acc = scale_{name}(acc);\n"
+
+     if bias is not None:
+         code += f"                acc += {name}_bias[oc];\n"
+
+     if use_relu:
+         code += f"                if (acc < 0) acc = 0;\n"
+
+     code += f"                output[oc * {out_h * out_w} + oy * {out_w} + ox] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));\n"
+     code += f"            }}\n"
+     code += f"        }}\n"
+     code += f"    }}\n"
+     code += f"}}\n\n"
+
+     return code
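Because the 2-bit stream has no zero code point and is packed LSB-first, the easiest sanity check is a Python round trip against `pack_weights_2bit`. A minimal sketch; `unpack_weights_2bit` is a test-only helper written here, not part of the package:

```python
import numpy as np
from potnn.codegen.bit2 import pack_weights_2bit

def unpack_weights_2bit(packed, out_ch, kernel_size):
    """Invert pack_weights_2bit (test helper, mirrors the generated C decode)."""
    words_per_filter = (kernel_size + 15) // 16
    out = np.zeros((out_ch, kernel_size), dtype=np.int32)
    for oc in range(out_ch):
        for i in range(kernel_size):
            word = int(packed[oc * words_per_filter + (i // 16)])
            code = (word >> (2 * (i % 16))) & 0x3
            val = 2 if (code & 1) else 1              # bit0: shift (x1 or x2)
            out[oc, i] = -val if (code & 2) else val  # bit1: sign
    return out

w_q = np.array([[1, -2, 2, -1, 1],
                [-1, 1, -2, 2, 2]])  # already quantized to ±1, ±2
packed, n = pack_weights_2bit(w_q)
assert n == w_q.size
assert (unpack_weights_2bit(packed, out_ch=2, kernel_size=5) == w_q).all()
```

Zeros cannot survive the trip: the packer remaps them to alternating +1/-1, so only zero-free inputs compare equal.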
potnn/codegen/fp130.py ADDED
@@ -0,0 +1,269 @@
+ """Generate FP1.3.0 encoded C code for PoT layers.
+
+ FP1.3.0 encoding: [sign(1)][exp(3)] = 4 bits
+ - 16 levels: ±1, ±2, ±4, ±8, ±16, ±32, ±64, ±128
+ - No zero (DenseShift style)
+ - Decoding: val = (1 << exp) * (sign ? -1 : 1)
+ """
+
+ import numpy as np
+ from typing import Dict, Any, Tuple
+
+
+ def pack_weights_fp130(w_q: np.ndarray) -> Tuple[np.ndarray, int]:
+     """Pack quantized weights to FP1.3.0 format.
+
+     Args:
+         w_q: Quantized weights (values in ±1, ±2, ±4, ±8, ±16, ±32, ±64, ±128)
+
+     Returns:
+         packed: uint32 array (8 weights per word)
+         original_size: number of weights
+     """
+     # w_q shape: [out_ch, in_ch, kh, kw], [out_ch, in_ch, kw], or [out_features, in_features]
+     if len(w_q.shape) == 4:
+         out_ch, in_ch, kh, kw = w_q.shape
+         kernel_size = in_ch * kh * kw
+     elif len(w_q.shape) == 3:
+         # Conv1d: [out_ch, in_ch, kw] -> treat as a 1xKW kernel
+         out_ch, in_ch, kw = w_q.shape
+         kernel_size = in_ch * kw
+     else:
+         out_ch, in_features = w_q.shape
+         kernel_size = in_features
+
+     # Words per filter (8 weights per uint32)
+     words_per_filter = (kernel_size + 7) // 8
+     packed = np.zeros(out_ch * words_per_filter, dtype=np.uint32)
+
+     for oc in range(out_ch):
+         # Get filter weights and flatten
+         filter_w = w_q[oc].flatten()
+         n = len(filter_w)
+
+         # Encode; zeros have no code point, so map them to alternating +1/-1
+         encoded = np.zeros(n, dtype=np.uint8)
+         for i, w in enumerate(filter_w):
+             if w == 0:
+                 w = 1 if i % 2 == 0 else -1
+             sign = 1 if w < 0 else 0
+             exp = int(np.log2(abs(w)))
+             encoded[i] = (sign << 3) | (exp & 0x7)
+
+         # Pack to uint32, LSB first
+         for i in range(0, n, 8):
+             chunk = 0
+             for j in range(8):
+                 if i + j < n:
+                     val = encoded[i + j]
+                     chunk |= (int(val) << (4 * j))
+             packed[oc * words_per_filter + (i // 8)] = chunk
+
+     return packed, out_ch * kernel_size
+
+
+ def generate_fp130_layer(layer_info: Dict[str, Any]) -> str:
+     """Generate FP1.3.0 encoded C code for a layer.
+
+     Args:
+         layer_info: Dictionary with layer information
+
+     Returns:
+         C code for the layer
+     """
+     name = layer_info['name']
+     layer_type = layer_info['type']
+     weight = layer_info['weight']
+     bias = layer_info.get('bias', None)
+     use_relu = layer_info.get('has_relu', False)
+     act_scale = layer_info.get('act_scale', 1.0) or 1.0
+
+     # Get weight as numpy
+     if hasattr(weight, 'numpy'):
+         w_q = weight.numpy()
+     else:
+         w_q = np.array(weight)
+
+     # Pack weights
+     packed, n_weights = pack_weights_fp130(w_q)
+
+     code = f"// {name} - FP1.3.0 encoding (16 levels, no zero)\n"
+     code += f"// Packed weights: {len(packed)*4} bytes ({n_weights} weights, packed as uint32)\n\n"
+
+     # Weight data
+     code += f"static const uint32_t {name}_weights[] = {{\n    "
+     for i, w in enumerate(packed):
+         code += f"0x{w:08x}, "
+         if (i + 1) % 8 == 0:  # 8 words per line
+             code += "\n    "
+     code += "\n};\n\n"
+
+     # Bias data (scaled by act_scale)
+     if bias is not None:
+         code += f"static const int32_t {name}_bias[] = {{\n    "
+         for i, b in enumerate(bias):
+             bias_val = int(round(b.item() * act_scale))
+             # int32 is wide enough here; no clipping needed
+             code += f"{bias_val}, "
+             if (i + 1) % 16 == 0:
+                 code += "\n    "
+         code += "\n};\n\n"
+
+     if 'Linear' in layer_type:
+         code += _generate_linear_fp130(name, w_q.shape, bias, use_relu, act_scale)
+     elif 'Conv2d' in layer_type:
+         code += _generate_conv2d_fp130(name, layer_info, bias, use_relu, act_scale)
+
+     return code
+
+
+ def _generate_linear_fp130(name: str, shape: tuple, bias, use_relu: bool, act_scale: float) -> str:
+     """Generate FP1.3.0 Linear layer code."""
+     out_features, in_features = shape
+
+     code = f"static void {name}_forward(const int8_t* input, int8_t* output) {{\n"
+     code += f"    const uint32_t* wp = {name}_weights;\n"
+     code += f"    int32_t acc, shifted;\n"
+     code += f"    uint32_t weight_chunk;\n"
+     code += f"    uint8_t sign, exp;\n\n"
+
+     code += f"    for (int o = 0; o < {out_features}; o++) {{\n"
+     code += f"        acc = 0;\n"
+     code += f"        for (int i = 0; i < {in_features}; i += 8) {{\n"
+     code += f"            weight_chunk = *wp++;\n"
+     code += f"            // Process 8 weights from chunk (LSB first)\n"
+     code += f"            for (int k = 0; k < 8 && (i + k) < {in_features}; k++) {{\n"
+     code += f"                // Lower 4 bits: [sign(1)][exp(3)]\n"
+     code += f"                sign = (weight_chunk >> 3) & 1;  // MSB of nibble\n"
+     code += f"                exp = weight_chunk & 0x7;\n"
+     code += f"\n"
+     code += f"                shifted = (int32_t)input[i + k] << exp;\n"
+     code += f"                acc += sign ? -shifted : shifted;\n"
+     code += f"\n"
+     code += f"                weight_chunk >>= 4;  // Advance to the next nibble\n"
+     code += f"            }}\n"
+     code += f"        }}\n"
+
+     # Scale
+     code += f"        acc = scale_{name}(acc);\n"
+
+     # Bias
+     if bias is not None:
+         code += f"        acc += {name}_bias[o];\n"
+
+     # ReLU
+     if use_relu:
+         code += f"        if (acc < 0) acc = 0;\n"
+
+     # Clamp and store
+     code += f"        output[o] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));\n"
+     code += f"    }}\n"
+     code += f"}}\n\n"
+
+     return code
+
+
+ def _generate_conv2d_fp130(name: str, layer_info: Dict, bias, use_relu: bool, act_scale: float) -> str:
+     """Generate FP1.3.0 Conv2d layer code."""
+     weight = layer_info['weight']
+     # Handle 3D weights (Conv1d)
+     if hasattr(weight, 'shape'):
+         w_shape = weight.shape
+     else:
+         w_shape = np.array(weight).shape
+
+     if len(w_shape) == 4:
+         out_ch, in_ch, kh, kw = w_shape
+     elif len(w_shape) == 3:
+         out_ch, in_ch, kw = w_shape
+         kh = 1
+         # No reshape needed; the loops below index by kh/kw directly
+
+     out_h = layer_info.get('out_h', 1)
+     out_w = layer_info.get('out_w', 1)
+     in_h = layer_info.get('in_h', 1)
+     in_w = layer_info.get('in_w', 1)
+     stride = layer_info.get('stride', 1)
+     padding = layer_info.get('padding', 0)
+     groups = layer_info.get('groups', 1)
+
+     # Handle tuple parameters
+     if isinstance(stride, tuple): stride_h, stride_w = stride
+     else: stride_h = stride_w = stride
+
+     if isinstance(padding, tuple): pad_h, pad_w = padding
+     else: pad_h = pad_w = padding
+
+     kernel_size = in_ch * kh * kw
+
+     code = f"static void {name}_forward(const int8_t* input, int8_t* output) {{\n"
+     code += f"    // Conv2d: {in_ch}x{in_h}x{in_w} -> {out_ch}x{out_h}x{out_w}\n"
+     code += f"    const uint32_t* wp;\n"
+     code += f"    int32_t acc, shifted;\n"
+     code += f"    uint32_t weight_chunk, packed;\n"
+     code += f"    uint8_t sign, exp;\n\n"
+
+     code += f"    for (int oc = 0; oc < {out_ch}; oc++) {{\n"
+     code += f"        // Start of this filter's weights in the 32-bit array:\n"
+     code += f"        // kernel_size weights total, packed 8 per word,\n"
+     code += f"        // i.e. (kernel_size + 7) / 8 words per filter\n"
+     code += f"        wp = {name}_weights + oc * {(kernel_size + 7) // 8};\n"
+     code += f"        for (int oy = 0; oy < {out_h}; oy++) {{\n"
+     code += f"            for (int ox = 0; ox < {out_w}; ox++) {{\n"
+     code += f"                acc = 0;\n"
+     code += f"                int w_idx = 0;\n"
+
+     # Group offset calculation (the weight's in_ch is already channels-per-group)
+     channels_per_group = in_ch
+     if groups == out_ch:
+         group_stride_str = f"oc * {channels_per_group}"
+     elif groups > 1:
+         out_per_group = out_ch // groups
+         group_stride_str = f"(oc / {out_per_group}) * {channels_per_group}"
+     else:
+         group_stride_str = "0"
+
+     code += f"                for (int ic = 0; ic < {in_ch}; ic++) {{\n"
+     code += f"                    for (int ky = 0; ky < {kh}; ky++) {{\n"
+     code += f"                        int iy = oy * {stride_h} + ky - {pad_h};\n"
+     code += f"                        if (iy < 0 || iy >= {in_h}) {{ w_idx += {kw}; continue; }}\n"
+     code += f"                        for (int kx = 0; kx < {kw}; kx++) {{\n"
+     code += f"                            int ix = ox * {stride_w} + kx - {pad_w};\n"
+     code += f"                            if (ix >= 0 && ix < {in_w}) {{\n"
+     code += f"                                // Random access into the packed 4-bit weights\n"
+     code += f"                                weight_chunk = wp[w_idx >> 3];\n"
+     code += f"                                // Nibble shift = (w_idx % 8) * 4\n"
+     code += f"                                packed = (weight_chunk >> ((w_idx & 7) << 2));\n"
+     code += f"\n"
+     code += f"                                // [sign(1)][exp(3)]\n"
+     code += f"                                sign = (packed >> 3) & 1;\n"
+     code += f"                                exp = packed & 0x7;\n"
+
+     if group_stride_str == "0":
+         input_idx = f"ic * {in_h * in_w} + iy * {in_w} + ix"
+     else:
+         input_idx = f"({group_stride_str} + ic) * {in_h * in_w} + iy * {in_w} + ix"
+
+     code += f"\n"
+     code += f"                                shifted = (int32_t)input[{input_idx}] << exp;\n"
+     code += f"                                acc += sign ? -shifted : shifted;\n"
+     code += f"                            }}\n"
+     code += f"                            w_idx++;\n"
+     code += f"                        }}\n"
+     code += f"                    }}\n"
+     code += f"                }}\n"
+     code += f"                acc = scale_{name}(acc);\n"
+
+     if bias is not None:
+         code += f"                acc += {name}_bias[oc];\n"
+
+     if use_relu:
+         code += f"                if (acc < 0) acc = 0;\n"
+
+     code += f"                output[oc * {out_h * out_w} + oy * {out_w} + ox] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));\n"
+     code += f"            }}\n"
+     code += f"        }}\n"
+     code += f"    }}\n"
+     code += f"}}\n\n"
+
+     return code
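The 4-bit packing admits the same round-trip check; decoding mirrors the generated C above: the low three bits of each nibble are the exponent, bit 3 the sign. Again a minimal sketch, with a test-only `unpack_weights_fp130` helper that is not a package API:

```python
import numpy as np
from potnn.codegen.fp130 import pack_weights_fp130

def unpack_weights_fp130(packed, out_ch, kernel_size):
    """Invert pack_weights_fp130 (test helper, mirrors the generated C decode)."""
    words_per_filter = (kernel_size + 7) // 8
    out = np.zeros((out_ch, kernel_size), dtype=np.int32)
    for oc in range(out_ch):
        for i in range(kernel_size):
            word = int(packed[oc * words_per_filter + (i // 8)])
            nibble = (word >> (4 * (i % 8))) & 0xF
            val = 1 << (nibble & 0x7)                     # exp(3): power of two
            out[oc, i] = -val if (nibble & 0x8) else val  # sign(1)
    return out

w_q = np.array([[1, -4, 128, -1],
                [2, 16, -64, 8]])  # powers of two, no zero
packed, n = pack_weights_fp130(w_q)
assert n == w_q.size
assert (unpack_weights_fp130(packed, out_ch=2, kernel_size=4) == w_q).all()
```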