potnn-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,616 @@
+ """Generate unrolled C code for PoT layers."""
+
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from typing import Dict, Any
+
+ from ..quantize.pot import quantize_to_pot
+
+
+ def generate_unrolled_layer(layer_info: Dict[str, Any]) -> str:
+     """Generate unrolled C code for a layer.
+
+     In unrolled mode, each non-zero weight becomes a direct instruction
+     with the shift amount embedded as an immediate value; zero weights
+     are omitted entirely.
+
+     For Conv2d: uses C loops for the spatial positions and unrolls the
+     channel/kernel operations.
+     For Linear: fully unrolls all weights.
+
+     Args:
+         layer_info: Dictionary with layer information.
+
+     Returns:
+         C code for the unrolled layer.
+     """
+     name = layer_info['name']
+     layer_type = layer_info['type']
+     weight = layer_info['weight']
+     alpha = layer_info['alpha']
+     bias = layer_info.get('bias', None)
+     use_relu = layer_info.get('has_relu', False)
+     is_last = layer_info.get('is_last', False)
+
+     # Layer dimensions for Conv2d
+     in_h = layer_info.get('in_h', 0)
+     in_w = layer_info.get('in_w', 0)
+     out_h = layer_info.get('out_h', 0)
+     out_w = layer_info.get('out_w', 0)
+     stride = layer_info.get('stride', 1)
+     padding = layer_info.get('padding', 0)
+
+     # Quantize weights to power-of-two values
+     with torch.no_grad():
+         w_q = quantize_to_pot(weight, alpha, levels=11).numpy()
+
+     # Get act_scale for bias scaling (None for the last layer)
+     act_scale = layer_info.get('act_scale')
+     if act_scale is None:
+         act_scale = 1.0  # Last layer: no scaling
+
+     # Debug output for codegen
+     print(f"\n  [DEBUG] Generating unrolled code for {name}:")
+     print(f"    Weight shape: {weight.shape}")
+     print(f"    PoT weight unique values: {sorted(set(w_q.flatten()))}")
+     print(f"    Non-zero weights: {np.count_nonzero(w_q)} / {w_q.size}")
+     if bias is not None:
+         all_bias_scaled = [int(round(b.item() * act_scale)) for b in bias]
+         print(f"    act_scale: {act_scale:.4f}")
+         print(f"    All scaled bias values: {all_bias_scaled}")
+     else:
+         print("    No bias")
+
+     code = f"// {name} - Unrolled (11 levels)\n"
+
+     if 'Linear' in layer_type:
+         code += _generate_linear_unrolled(name, w_q, bias, use_relu, act_scale)
+     elif 'Depthwise' in layer_type:
+         # DepthwiseConv2d: use the baseline depthwise generator
+         from ..export import generate_depthwise_conv_layer
+         return generate_depthwise_conv_layer(layer_info, is_first_layer=False)
+     elif 'Conv2d' in layer_type:
+         groups = layer_info.get('groups', 1)
+         code += _generate_conv2d_unrolled(
+             name, w_q, bias, use_relu,
+             in_h, in_w, out_h, out_w, stride, padding, act_scale, groups
+         )
+
+     return code
+
+
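+ # Editor's illustration (not part of the package API): a minimal sketch of the
+ # layer_info dict this generator expects, using a hypothetical 2x4 Linear layer.
+ # The keys mirror the lookups above ('name', 'type', 'weight', 'alpha', 'bias',
+ # 'has_relu', 'act_scale'); Conv2d layers additionally carry the spatial keys.
+ #
+ #     layer_info = {
+ #         'name': 'fc1',
+ #         'type': 'Linear',
+ #         'weight': torch.tensor([[0.50, 0.00, -0.25, 1.00],
+ #                                 [0.00, 2.00,  0.00, -0.50]]),
+ #         'alpha': 1.0,
+ #         'bias': torch.tensor([0.10, -0.20]),
+ #         'has_relu': True,
+ #         'act_scale': 8.0,
+ #     }
+ #     c_source = generate_unrolled_layer(layer_info)  # returns a C string
+
+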
+ def _generate_linear_unrolled(name: str, w_q: np.ndarray, bias, use_relu: bool, act_scale: float) -> str:
+     """Generate unrolled code for a Linear layer."""
+     out_features, in_features = w_q.shape
+
+     code = f"static void {name}_forward(const int8_t* input, int8_t* output) {{\n"
+
+     for out_idx in range(out_features):
+         code += f"    {{ // output[{out_idx}]\n"
+         code += f"        int32_t acc = 0;\n"
+
+         # Generate unrolled operations for non-zero weights
+         for in_idx in range(in_features):
+             w = int(w_q[out_idx, in_idx])
+             if w == 0:
+                 continue
+             code += _pot_operation_direct(in_idx, w)
+
+         # Apply the requantization scale first
+         code += f"        acc = scale_{name}(acc);\n"
+
+         # Add bias after the scale (scaled by act_scale, rounded to nearest)
+         if bias is not None:
+             bias_val = int(round(bias[out_idx].item() * act_scale))
+             if bias_val != 0:
+                 code += f"        acc += {bias_val};\n"
+
+         # Apply ReLU if needed
+         if use_relu:
+             code += f"        if (acc < 0) acc = 0;\n"
+
+         # Clamp to int8 and store
+         code += f"        output[{out_idx}] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));\n"
+         code += f"    }}\n"
+
+     code += "}\n\n"
+     return code
+
+
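+ # Editor's illustration: for a hypothetical 1-output, 2-input layer named 'fc'
+ # with PoT weights [2, -1], a scaled bias of 3 and ReLU, the generator above
+ # emits C along these lines (scale_fc() is the per-layer scaling helper
+ # referenced above, assumed to be defined elsewhere in the generated C):
+ #
+ #     static void fc_forward(const int8_t* input, int8_t* output) {
+ #         { // output[0]
+ #             int32_t acc = 0;
+ #             acc += (int32_t)input[0] << 1;
+ #             acc -= (int32_t)input[1];
+ #             acc = scale_fc(acc);
+ #             acc += 3;
+ #             if (acc < 0) acc = 0;
+ #             output[0] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));
+ #         }
+ #     }
+
+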
+ def _generate_conv2d_unrolled(
+     name: str, w_q: np.ndarray, bias, use_relu: bool,
+     in_h: int, in_w: int, out_h: int, out_w: int,
+     stride: int, padding: int, act_scale: float, groups: int
+ ) -> str:
+     """Generate Conv2d code with C loops for positions and unrolled channels.
+
+     This produces compact code like the v4 generator:
+     - Outer loops (oy, ox) are C for-loops
+     - Channel/kernel operations are unrolled inside
+     """
+     out_channels, in_channels, kh, kw = w_q.shape
+
+     code = f"static void {name}_forward(const int8_t* input, int8_t* output) {{\n"
+     code += f"    // Conv2d: {in_channels}x{in_h}x{in_w} -> {out_channels}x{out_h}x{out_w}\n"
+     code += f"    // Kernel: {kh}x{kw}, Stride: {stride}, Padding: {padding}\n"
+     code += f"    int32_t acc;\n\n"
+
+     # C loops for output positions
+     code += f"    for (int oy = 0; oy < {out_h}; oy++) {{\n"
+     code += f"        for (int ox = 0; ox < {out_w}; ox++) {{\n"
+
+     # Unroll each output channel
+     for oc in range(out_channels):
+         code += f"            // Output channel {oc}\n"
+         code += f"            acc = 0;\n"
+
+         # Unroll kernel operations
+         for ic in range(in_channels):
+             for ky in range(kh):
+                 for kx in range(kw):
+                     w = int(w_q[oc, ic, ky, kx])
+                     if w == 0:
+                         continue
+
+                     # Kernel offsets relative to (oy, ox).
+                     # Handle padding given as an int or a tuple.
+                     pad_h = padding[0] if isinstance(padding, tuple) else padding
+                     pad_w = padding[1] if isinstance(padding, tuple) else padding
+                     ky_off = ky - pad_h  # -1, 0, 1 for 3x3 with pad=1
+                     kx_off = kx - pad_w
+
+                     # Input index expression:
+                     #   in_y = oy * stride + ky_off
+                     #   in_x = ox * stride + kx_off
+                     #   idx  = ic_base + in_y * in_w + in_x
+
+                     # Group offset: for grouped convs, w_q.shape[1] is the
+                     # number of input channels per group.
+                     channels_per_group = in_channels
+                     if groups == out_channels:
+                         # Depthwise
+                         group_ch_offset = oc * channels_per_group
+                     elif groups > 1:
+                         out_per_group = out_channels // groups
+                         group_ch_offset = (oc // out_per_group) * channels_per_group
+                     else:
+                         group_ch_offset = 0
+
+                     ic_base = (group_ch_offset + ic) * in_h * in_w
+
+                     if stride == 1:
+                         if ky_off == 0:
+                             y_expr = "oy"
+                         elif ky_off > 0:
+                             y_expr = f"oy + {ky_off}"
+                         else:
+                             y_expr = f"oy - {-ky_off}"
+
+                         if kx_off == 0:
+                             x_expr = "ox"
+                         elif kx_off > 0:
+                             x_expr = f"ox + {kx_off}"
+                         else:
+                             x_expr = f"ox - {-kx_off}"
+                     else:
+                         if ky_off == 0:
+                             y_expr = f"oy * {stride}"
+                         elif ky_off > 0:
+                             y_expr = f"oy * {stride} + {ky_off}"
+                         else:
+                             y_expr = f"oy * {stride} - {-ky_off}"
+
+                         if kx_off == 0:
+                             x_expr = f"ox * {stride}"
+                         elif kx_off > 0:
+                             x_expr = f"ox * {stride} + {kx_off}"
+                         else:
+                             x_expr = f"ox * {stride} - {-kx_off}"
+
+                     idx_expr = f"{ic_base} + ({y_expr}) * {in_w} + ({x_expr})"
+
+                     # Boundary checks against the input extent: the tap is valid
+                     # iff 0 <= oy * stride + ky_off < in_h (and likewise for x).
+                     y_pos = "oy" if stride == 1 else f"oy * {stride}"
+                     x_pos = "ox" if stride == 1 else f"ox * {stride}"
+                     conditions = []
+                     if ky_off < 0:
+                         conditions.append(f"{y_pos} >= {-ky_off}")
+                     elif ky_off > 0:
+                         conditions.append(f"{y_pos} < {in_h - ky_off}")
+
+                     if kx_off < 0:
+                         conditions.append(f"{x_pos} >= {-kx_off}")
+                     elif kx_off > 0:
+                         conditions.append(f"{x_pos} < {in_w - kx_off}")
+
+                     # Generate the accumulate operation
+                     op = _pot_operation_expr(idx_expr, w)
+
+                     if conditions:
+                         cond = " && ".join(conditions)
+                         code += f"            if ({cond}) {op}\n"
+                     else:
+                         code += f"            {op}\n"
+
+         # Apply the requantization scale first
+         code += f"            acc = scale_{name}(acc);\n"
+
+         # Add bias after the scale (scaled by act_scale, rounded to nearest)
+         if bias is not None:
+             bias_val = int(round(bias[oc].item() * act_scale))
+             if bias_val != 0:
+                 code += f"            acc += {bias_val};\n"
+
+         # Apply ReLU if needed
+         if use_relu:
+             code += f"            if (acc < 0) acc = 0;\n"
+
+         # Clamp to int8 and store
+         out_base = oc * out_h * out_w
+         code += f"            output[{out_base} + oy * {out_w} + ox] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));\n"
+         code += f"\n"
+
+     code += f"        }}\n"
+     code += f"    }}\n"
+     code += "}\n\n"
+     return code
+
+
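+ # Editor's illustration: for a hypothetical 3x3 kernel with pad=1, stride=1 on
+ # an 8x8 single-channel input, the tap at (ky, kx) = (0, 0) has offsets (-1, -1),
+ # so the generator above emits a boundary-guarded shift-and-add such as
+ #
+ #     if (oy >= 1 && ox >= 1) acc += (int32_t)input[0 + (oy - 1) * 8 + (ox - 1)];
+ #
+ # while taps that always land inside the input are emitted without the guard.
+
+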
+ def _pot_operation_direct(idx: int, w: int) -> str:
+     """Generate a PoT operation with a direct (constant) index."""
+     abs_w = abs(w)
+     # Only non-zero power-of-two magnitudes up to 128 are supported;
+     # anything else contributes no instruction.
+     if w == 0 or abs_w > 128 or (abs_w & (abs_w - 1)) != 0:
+         return ""
+     shift = _get_shift(abs_w)
+     sign = "+" if w > 0 else "-"
+     if shift == 0:
+         return f"        acc {sign}= (int32_t)input[{idx}];\n"
+     return f"        acc {sign}= (int32_t)input[{idx}] << {shift};\n"
+
+
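+ # Editor's illustration: _pot_operation_direct(3, -4) yields the C statement
+ #     acc -= (int32_t)input[3] << 2;
+ # while a weight of 0 (or any non power-of-two magnitude) yields no statement.
+
+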
+ def _pot_operation_expr(idx_expr: str, w: int) -> str:
+     """Generate a PoT operation with an index expression."""
+     shift = _get_shift(abs(w))
+     sign = "+" if w > 0 else "-"
+
+     if shift == 0:
+         return f"acc {sign}= (int32_t)input[{idx_expr}];"
+     else:
+         return f"acc {sign}= (int32_t)input[{idx_expr}] << {shift};"
+
+
+ def _get_shift(abs_w: int) -> int:
+     """Get the shift amount for an absolute PoT value (1 -> 0, 2 -> 1, ..., 128 -> 7)."""
+     if abs_w in (1, 2, 4, 8, 16, 32, 64, 128):
+         return abs_w.bit_length() - 1
+     return 0
+
+
+ # =============================================================================
+ # OPTIMIZED UNROLL GENERATION (Zero-Padding only)
+ # Eliminates boundary if statements by using a zero-padded input buffer
+ # =============================================================================
+
+ def generate_unrolled_layer_optimized(layer_info: Dict[str, Any], is_first_layer: bool = False) -> str:
+     """Generate optimized unrolled C code with Zero-Padding.
+
+     Optimization applied:
+     - Zero-Padding: eliminates all boundary-check if statements
+
+     Args:
+         layer_info: Dictionary with layer information.
+         is_first_layer: If True, the input type is uint8_t (image input).
+
+     Returns:
+         C code for the optimized layer.
+     """
+     name = layer_info['name']
+     layer_type = layer_info['type']
+     weight = layer_info['weight']
+     alpha = layer_info['alpha']
+     bias = layer_info.get('bias', None)
+     use_relu = layer_info.get('has_relu', False)
+     is_last = layer_info.get('is_last', False)
+
+     # Layer dimensions
+     in_h = layer_info.get('in_h', 0)
+     in_w = layer_info.get('in_w', 0)
+     out_h = layer_info.get('out_h', 0)
+     out_w = layer_info.get('out_w', 0)
+     stride = layer_info.get('stride', 1)
+     padding = layer_info.get('padding', 0)
+
+     # Weight is already quantized to PoT values in collect_pot_layer_info().
+     # DO NOT call quantize_to_pot again! Just convert to numpy.
+     if isinstance(weight, torch.Tensor):
+         w_q = weight.detach().numpy()
+     else:
+         w_q = weight
+
+     # Get act_scale for bias scaling
+     act_scale = layer_info.get('act_scale')
+     if act_scale is None:
+         act_scale = 1.0
+
+     # Debug: print weight statistics
+     unique_vals = sorted(set(w_q.flatten().astype(int)))
+     print(f"  [OPTIMIZED UNROLL] {name}: first_layer={is_first_layer}, unique weights={unique_vals}")
+
+     code = f"// {name} - Unrolled with Zero-Padding (11 levels)\n"
+
+     if 'Linear' in layer_type:
+         # Linear layers need no spatial padding: use the original generator
+         code += _generate_linear_unrolled(name, w_q, bias, use_relu, act_scale)
+     elif 'Conv1d' in layer_type:
+         # Conv1d with Zero-Padding
+         in_L = layer_info.get('in_L', 0)
+         out_L = layer_info.get('out_L', 0)
+         code += _generate_conv1d_unrolled_optimized(
+             name, w_q, bias, use_relu,
+             in_L, out_L, stride, padding, act_scale,
+             is_first_layer=is_first_layer
+         )
+     elif 'Conv2d' in layer_type and 'Depthwise' not in layer_type:
+         # Standard Conv2d with Zero-Padding
+         code += _generate_conv2d_unrolled_optimized(
+             name, w_q, bias, use_relu,
+             in_h, in_w, out_h, out_w, stride, padding, act_scale,
+             is_first_layer=is_first_layer
+         )
+     else:
+         if 'Depthwise' in layer_type:
+             # DepthwiseConv2d: use the baseline's dedicated depthwise generator
+             print(f"  [INFO] {name}: {layer_type} using baseline depthwise generator")
+             from ..export import generate_depthwise_conv_layer
+             return generate_depthwise_conv_layer(layer_info, is_first_layer)
+         else:
+             # Other types: fall back to the original unroll
+             print(f"  [INFO] {name}: {layer_type} falling back to original unroll")
+             return generate_unrolled_layer(layer_info)
+
+     return code
+
+
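+ # Editor's illustration (not part of the package API): a hypothetical call for
+ # the first (image-input) layer. With is_first_layer=True the generated
+ # forward() takes a uint8_t* input and stages it into an int16_t padded buffer;
+ # later layers use int8_t throughout.
+ #
+ #     c_source = generate_unrolled_layer_optimized(conv1_info, is_first_layer=True)
+ #
+ # where conv1_info is a layer_info dict like the one sketched after
+ # generate_unrolled_layer() above, but with 'type': 'Conv2d' and the spatial
+ # keys ('in_h', 'in_w', 'out_h', 'out_w', 'stride', 'padding') filled in.
+
+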
+ def _generate_conv2d_unrolled_optimized(
+     name: str, w_q: np.ndarray, bias, use_relu: bool,
+     in_h: int, in_w: int, out_h: int, out_w: int,
+     stride: int, padding: int, act_scale: float,
+     is_first_layer: bool = False
+ ) -> str:
+     """Generate Conv2d unrolled code with Zero-Padding.
+
+     Zero-Padding eliminates all boundary if statements.
+
+     Args:
+         is_first_layer: If True, the input type is uint8_t (image input).
+     """
+     out_ch, in_ch, kh, kw = w_q.shape
+
+     # Calculate padded dimensions
+     padded_h = in_h + 2 * padding
+     padded_w = in_w + 2 * padding
+     padded_size = in_ch * padded_h * padded_w
+
+     # Input type depends on whether this is the first layer
+     input_type = "uint8_t" if is_first_layer else "int8_t"
+
+     code = f"// Conv2d: {in_ch}x{in_h}x{in_w} -> {out_ch}x{out_h}x{out_w}\n"
+     code += f"// Kernel: {kh}x{kw}, Stride: {stride}, Padding: {padding}\n"
+     code += f"// Optimized: Zero-Padding (no boundary checks)\n\n"
+
+     code += f"static void {name}_forward(const {input_type}* input, int8_t* output) {{\n"
+
+     # Zero-padding buffer.
+     # First layer: use int16_t so uint8 values (0-255) are preserved without overflow.
+     # Other layers: use int8_t since values are already in the -128..127 range.
+     padded_type = "int16_t" if is_first_layer else "int8_t"
+     padded_elem_size = 2 if is_first_layer else 1
+
+     code += f"    // Zero-padding: {in_ch}x{in_h}x{in_w} -> {in_ch}x{padded_h}x{padded_w}\n"
+     code += f"    static {padded_type} padded[{padded_size}];\n"
+     code += f"    memset(padded, 0, {padded_size * padded_elem_size});\n\n"
+
+     # Copy the input into the padded buffer (no casting needed)
+     code += f"    for (int c = 0; c < {in_ch}; c++)\n"
+     code += f"        for (int y = 0; y < {in_h}; y++)\n"
+     code += f"            for (int x = 0; x < {in_w}; x++)\n"
+     code += f"                padded[c * {padded_h * padded_w} + (y + {padding}) * {padded_w} + (x + {padding})] = input[c * {in_h * in_w} + y * {in_w} + x];\n\n"
+
+     code += f"    int32_t acc;\n\n"
+
+     # C loops for output positions
+     code += f"    for (int oy = 0; oy < {out_h}; oy++) {{\n"
+     code += f"        for (int ox = 0; ox < {out_w}; ox++) {{\n"
+
+     # Unroll each output channel
+     for oc in range(out_ch):
+         code += f"            // Output channel {oc}\n"
+         code += f"            acc = 0;\n"
+
+         # Unroll kernel operations - no boundary checks needed
+         for ic in range(in_ch):
+             for ky in range(kh):
+                 for kx in range(kw):
+                     w = int(w_q[oc, ic, ky, kx])
+                     if w == 0:
+                         continue
+
+                     # Direct access into the padded buffer:
+                     #   idx = ic * padded_h * padded_w + (oy * stride + ky) * padded_w + (ox * stride + kx)
+                     ic_base = ic * padded_h * padded_w
+
+                     if stride == 1:
+                         y_expr = f"oy + {ky}" if ky > 0 else "oy"
+                         x_expr = f"ox + {kx}" if kx > 0 else "ox"
+                     else:
+                         y_expr = f"oy * {stride} + {ky}" if ky > 0 else f"oy * {stride}"
+                         x_expr = f"ox * {stride} + {kx}" if kx > 0 else f"ox * {stride}"
+
+                     idx_expr = f"{ic_base} + ({y_expr}) * {padded_w} + ({x_expr})"
+
+                     # Generate the operation without a boundary check
+                     op = _pot_operation_padded(idx_expr, w)
+                     code += f"            {op}\n"
+
+         # Apply the requantization scale
+         code += f"            acc = scale_{name}(acc);\n"
+
+         # Add bias (scaled by act_scale, rounded to nearest)
+         if bias is not None:
+             bias_val = int(round(bias[oc].item() * act_scale))
+             if bias_val != 0:
+                 code += f"            acc += {bias_val};\n"
+
+         # Apply ReLU
+         if use_relu:
+             code += f"            if (acc < 0) acc = 0;\n"
+
+         # Clamp to int8 and store
+         out_base = oc * out_h * out_w
+         code += f"            output[{out_base} + oy * {out_w} + ox] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));\n"
+         code += f"\n"
+
+     code += f"        }}\n"
+     code += f"    }}\n"
+     code += "}\n\n"
+
+     return code
+
+
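+ # Editor's illustration (hypothetical 1x8x8 input, 3x3 kernel, pad=1, stride=1,
+ # not the first layer): the generated preamble stages the input into a
+ # zero-initialized 1x10x10 buffer so every kernel tap can read it directly:
+ #
+ #     // Zero-padding: 1x8x8 -> 1x10x10
+ #     static int8_t padded[100];
+ #     memset(padded, 0, 100);
+ #
+ #     for (int c = 0; c < 1; c++)
+ #         for (int y = 0; y < 8; y++)
+ #             for (int x = 0; x < 8; x++)
+ #                 padded[c * 100 + (y + 1) * 10 + (x + 1)] = input[c * 64 + y * 8 + x];
+ #
+ # and each non-zero tap then becomes an unconditional shift-and-add such as
+ #     acc += (int32_t)padded[0 + (oy) * 10 + (ox)];
+
+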
+ def _pot_operation_padded(idx_expr: str, w: int) -> str:
+     """Generate a PoT operation for a padded-buffer access."""
+     shift = _get_shift(abs(w))
+     sign = "+" if w > 0 else "-"
+
+     if shift == 0:
+         return f"acc {sign}= (int32_t)padded[{idx_expr}];"
+     else:
+         return f"acc {sign}= (int32_t)padded[{idx_expr}] << {shift};"
+
+
+ def _generate_conv1d_unrolled_optimized(
+     name: str, w_q: np.ndarray, bias, use_relu: bool,
+     in_L: int, out_L: int, stride: int, padding: int, act_scale: float,
+     is_first_layer: bool = False
+ ) -> str:
+     """Generate Conv1d unrolled code with Zero-Padding.
+
+     Zero-Padding eliminates all boundary if statements.
+
+     Args:
+         is_first_layer: If True, the input type is uint8_t.
+     """
+     out_ch, in_ch, kL = w_q.shape
+
+     # Calculate padded dimensions
+     padded_L = in_L + 2 * padding
+     padded_size = in_ch * padded_L
+
+     # Input type depends on whether this is the first layer
+     input_type = "uint8_t" if is_first_layer else "int8_t"
+
+     code = f"// Conv1d: {in_ch}x{in_L} -> {out_ch}x{out_L}\n"
+     code += f"// Kernel: {kL}, Stride: {stride}, Padding: {padding}\n"
+     code += f"// Optimized: Zero-Padding (no boundary checks)\n\n"
+
+     code += f"static void {name}_forward(const {input_type}* input, int8_t* output) {{\n"
+
+     # Zero-padding buffer
+     padded_type = "int16_t" if is_first_layer else "int8_t"
+     padded_elem_size = 2 if is_first_layer else 1
+
+     code += f"    // Zero-padding: {in_ch}x{in_L} -> {in_ch}x{padded_L}\n"
+     code += f"    static {padded_type} padded[{padded_size}];\n"
+     code += f"    memset(padded, 0, {padded_size * padded_elem_size});\n\n"
+
+     # Copy the input into the padded buffer
+     code += f"    for (int c = 0; c < {in_ch}; c++)\n"
+     code += f"        for (int i = 0; i < {in_L}; i++)\n"
+     code += f"            padded[c * {padded_L} + (i + {padding})] = input[c * {in_L} + i];\n\n"
+
+     code += f"    int32_t acc;\n\n"
+
+     # Loop over output positions
+     code += f"    for (int o = 0; o < {out_L}; o++) {{\n"
+
+     # Unroll each output channel
+     for oc in range(out_ch):
+         code += f"        // Output channel {oc}\n"
+         code += f"        acc = 0;\n"
+
+         # Unroll kernel operations - no boundary checks needed
+         for ic in range(in_ch):
+             for k in range(kL):
+                 w = int(w_q[oc, ic, k])
+                 if w == 0:
+                     continue
+
+                 # Direct access into the padded buffer
+                 ic_base = ic * padded_L
+
+                 if stride == 1:
+                     idx_expr = f"{ic_base} + o + {k}" if k > 0 else f"{ic_base} + o"
+                 else:
+                     idx_expr = f"{ic_base} + o * {stride} + {k}" if k > 0 else f"{ic_base} + o * {stride}"
+
+                 # Generate the operation without a boundary check
+                 op = _pot_operation_padded(idx_expr, w)
+                 code += f"        {op}\n"
+
+         # Apply the requantization scale
+         code += f"        acc = scale_{name}(acc);\n"
+
+         # Add bias (scaled by act_scale, rounded to nearest)
+         if bias is not None:
+             bias_val = int(round(bias[oc].item() * act_scale))
+             if bias_val != 0:
+                 code += f"        acc += {bias_val};\n"
+
+         # Apply ReLU
+         if use_relu:
+             code += f"        if (acc < 0) acc = 0;\n"
+
+         # Clamp to int8 and store
+         out_base = oc * out_L
+         code += f"        output[{out_base} + o] = (int8_t)(acc > 127 ? 127 : (acc < -128 ? -128 : acc));\n"
+         code += f"\n"
+
+     code += f"    }}\n"
+     code += "}\n\n"
+
+     return code
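+
+
+ # Editor's illustration (hypothetical single-channel Conv1d named 'conv1' with
+ # kernel weights [1, 2, -1], stride=1): inside the o loop each non-zero tap
+ # becomes an unconditional shift-and-add against the padded buffer:
+ #
+ #     acc = 0;
+ #     acc += (int32_t)padded[0 + o];
+ #     acc += (int32_t)padded[0 + o + 1] << 1;
+ #     acc -= (int32_t)padded[0 + o + 2];
+ #     acc = scale_conv1(acc);
+ #
+ # (the name 'conv1' and the weights are placeholders for illustration only)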