potnn-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,207 @@
+ """Integer simulation operations with Straight-Through Estimator (STE).
+ 
+ This module provides the core building blocks for "Integer-Only QAT".
+ All operations in the forward pass simulate C integer arithmetic exactly,
+ while the backward pass allows gradients to flow for training.
+ """
+ 
+ import math
+ 
+ import torch
+ import torch.nn.functional as F
+ 
+ # =============================================================================
+ # Core Rounding Functions
+ # =============================================================================
+ 
+ class RoundHalfUpSTE(torch.autograd.Function):
+     """Half-up rounding with STE (C style).
+ 
+     Forward: floor(x + 0.5)
+     Backward: identity (gradient passes through unchanged)
+     """
+     @staticmethod
+     def forward(ctx, x):
+         return torch.floor(x + 0.5)
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         return grad_output
+ 
+ def round_half_up_ste(x: torch.Tensor) -> torch.Tensor:
+     """Round half up with STE. Matches C behavior: (int)(x + 0.5)."""
+     return RoundHalfUpSTE.apply(x)
+ 
+ class FloorSTE(torch.autograd.Function):
+     """Floor with STE."""
+     @staticmethod
+     def forward(ctx, x):
+         return torch.floor(x)
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         return grad_output
+ 
+ def floor_ste(x: torch.Tensor) -> torch.Tensor:
+     return FloorSTE.apply(x)
+ 
+ class ClampSTE(torch.autograd.Function):
+     """Clamp with STE."""
+     @staticmethod
+     def forward(ctx, x, min_val, max_val):
+         return x.clamp(min_val, max_val)
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         return grad_output, None, None
+ 
+ def clamp_ste(x: torch.Tensor, min_val: float, max_val: float) -> torch.Tensor:
+     return ClampSTE.apply(x, min_val, max_val)
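For reference, the three primitives above behave as follows on a few illustrative values (assuming the definitions above are in scope; expected outputs, worked out from those definitions, are shown as comments):

    x = torch.tensor([2.5, -2.5, 3.7, -3.7])
    round_half_up_ste(x)                     # tensor([ 3., -2.,  4., -4.])
    floor_ste(x)                             # tensor([ 2., -3.,  3., -4.])
    clamp_ste(x * 100.0, -128.0, 127.0)      # tensor([ 127., -128.,  127., -128.])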
+ 
+ 
+ # =============================================================================
+ # Integer Simulation Functions
+ # =============================================================================
+ 
+ def fake_quantize_input(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+     """Quantize a float input to the int8 range (simulated as float).
+ 
+     Args:
+         x: Input tensor (float)
+         scale: Input scale factor (127.0 / max_val or similar)
+ 
+     Returns:
+         Quantized tensor (float dtype, but integer values)
+     """
+     # x_int = round(x * scale), then clamp to [-128, 127].
+     # Signed int8 is assumed for the general case; the first layer may use
+     # the uint8 variant below instead.
+     # round_half_up_ste keeps rounding consistent with the C implementation.
+     return clamp_ste(round_half_up_ste(x * scale), -128.0, 127.0)
+ 
+ def fake_quantize_input_uint8(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+     """Quantize a float input to the uint8 range [0, 255]."""
+     return clamp_ste(round_half_up_ste(x * scale), 0.0, 255.0)
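For example, with a hypothetical scale of 254.0 (127 / 0.5, i.e. activations bounded by 0.5), the signed variant maps into [-128, 127], while the uint8 variant maps a [0, 1] input onto [0, 255]:

    fake_quantize_input(torch.tensor([0.25, -0.50, 0.49]), torch.tensor(254.0))
    # tensor([  64., -127.,  124.])
    fake_quantize_input_uint8(torch.tensor([0.0, 0.5, 1.0]), torch.tensor(255.0))
    # tensor([  0., 128., 255.])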
+ 
+ 
+ class FakeRequantizeSTE(torch.autograd.Function):
+     """Simulate C-style requantization: (acc * scale_int + round) >> shift.
+ 
+     This is the core of Integer-Only QAT.
+     """
+     @staticmethod
+     def forward(ctx, acc, scale_int, shift):
+         # acc: int32 accumulator (simulated as float)
+         # scale_int: integer scale
+         # shift: integer shift
+         #
+         # C logic:
+         #     int64_t temp = (int64_t)acc * scale_int;
+         #     temp += (1 << (shift - 1));  // round
+         #     output = temp >> shift;
+         ctx.save_for_backward(
+             torch.tensor(scale_int, dtype=torch.float32, device=acc.device),
+             torch.tensor(shift, dtype=torch.float32, device=acc.device))
+ 
+         scale_int_val = int(scale_int)
+         shift_val = int(shift)
+ 
+         if shift_val > 0:
+             round_const = 1 << (shift_val - 1)
+         else:
+             round_const = 0
+ 
+         # 1. Multiply in double precision. acc is float32 but holds integer
+         #    values; acc * scale_int can exceed 2^24 (16M), where float32
+         #    starts losing integer precision. float64 has a 53-bit
+         #    significand, which is sufficient well beyond 10^15.
+         val = acc.double() * scale_int_val
+ 
+         # 2. Add the rounding constant
+         val = val + round_const
+ 
+         # 3. Right shift == floor division by 2^shift
+         divisor = float(1 << shift_val)
+         val = torch.floor(val / divisor)
+ 
+         return val.float()
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         # STE: treat the op as multiplication by the effective scale,
+         # out ≈ acc * (scale_int / 2^shift), so
+         # grad_acc = grad_out * (scale_int / 2^shift).
+         scale_int, shift = ctx.saved_tensors
+         effective_scale = scale_int / (2.0 ** shift)
+         return grad_output * effective_scale, None, None
+ 
+ 
+ def fake_requantize(acc: torch.Tensor, scale_int: int, shift: int) -> torch.Tensor:
+     """Simulate C-style requantization with STE."""
+     return FakeRequantizeSTE.apply(acc, float(scale_int), float(shift))
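A worked example with hypothetical parameters scale_int = 187 and shift = 10 (effective scale 187 / 1024 ≈ 0.183):

    acc = torch.tensor([1000.0])       # int32 accumulator value held in a float tensor
    fake_requantize(acc, 187, 10)      # tensor([183.]), since (1000 * 187 + 512) >> 10 == 183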
+ 
+ 
+ def fake_integer_gap(x: torch.Tensor) -> torch.Tensor:
+     """Simulate C-style Global Average Pooling: (sum + 32) >> 6 for 8x8.
+ 
+     For a generic power-of-two pool size H*W: (sum + (HW // 2)) >> log2(HW).
+     Non-power-of-two sizes fall back to half-up rounding of the mean.
+     """
+     # x is assumed to already hold integer values (int8 output of the
+     # previous layer). Shape: [N, C, H, W]
+ 
+     # 1. Sum over H, W
+     sum_val = x.sum(dim=(2, 3))  # [N, C]
+ 
+     # 2. Divide by the pool size with C-style rounding
+     pool_size = x.shape[2] * x.shape[3]
+ 
+     if (pool_size & (pool_size - 1)) == 0:
+         # Power of two: add the rounding constant, then shift
+         shift = int(math.log2(pool_size))
+         round_const = 1 << (shift - 1)
+         val = floor_ste((sum_val + round_const) / (1 << shift))
+     else:
+         # Generic pool size. The C side typically uses a (mult, shift)
+         # pair here; half-up rounding of the mean is a close approximation
+         # when those exact parameters are not available.
+         val = round_half_up_ste(sum_val / pool_size)
+ 
+     return val
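For instance, an 8x8 map whose 64 entries sum to 1000 is pooled to (1000 + 32) >> 6 = 16, matching the C kernel (hypothetical single-channel input):

    x = torch.full((1, 1, 8, 8), 15.0)
    x[0, 0, 0, 0] = 55.0               # 64 entries summing to 1000
    fake_integer_gap(x)                # tensor([[16.]])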
@@ -0,0 +1,225 @@
+ """Integer simulation functions for QAT.
+ 
+ These functions simulate C integer operations in PyTorch while allowing
+ gradient flow through the Straight-Through Estimator (STE).
+ 
+ C operations:
+     - round: (x + 0.5) truncation
+     - clamp: min/max saturation
+     - requantize: (acc * scale_int + round) >> shift
+ 
+ The Python simulation must match C bit-for-bit for QAT to be accurate.
+ 
+ Usage:
+     from potnn.quantize.integer_sim import (
+         round_ste, floor_ste, clamp_ste,
+         quantize_to_int8_ste, quantize_to_uint8_ste,
+         requantize_ste, compute_scale_params
+     )
+ """
+ 
+ import torch
+ import torch.nn as nn
+ 
+ 
+ class RoundSTE(torch.autograd.Function):
+     """Round with Straight-Through Estimator.
+ 
+     Forward: torch.round(x)
+     Backward: gradient passes through unchanged
+     """
+ 
+     @staticmethod
+     def forward(ctx, x):
+         return torch.round(x)
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         return grad_output
+ 
+ 
+ class RoundHalfUpSTE(torch.autograd.Function):
+     """Half-up rounding with STE (C style).
+ 
+     Forward: floor(x + 0.5) - matches C's (x + 0.5) truncation
+     Backward: gradient passes through unchanged
+ 
+     This matches C integer rounding:
+         (int)(x + 0.5) for positive x
+         (x * scale + (1 << (shift-1))) >> shift
+     """
+ 
+     @staticmethod
+     def forward(ctx, x):
+         return torch.floor(x + 0.5)
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         return grad_output
+ 
+ 
+ class FloorSTE(torch.autograd.Function):
+     """Floor with Straight-Through Estimator.
+ 
+     Forward: torch.floor(x)
+     Backward: gradient passes through unchanged
+ 
+     Used for integer division: a // b = floor(a / b)
+     """
+ 
+     @staticmethod
+     def forward(ctx, x):
+         return torch.floor(x)
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         return grad_output
+ 
+ 
+ class ClampSTE(torch.autograd.Function):
+     """Clamp with Straight-Through Estimator.
+ 
+     Forward: torch.clamp(x, min_val, max_val)
+     Backward: gradient passes through unchanged
+ 
+     Note: Standard clamp has zero gradient outside [min, max].
+     The STE version allows gradient to flow for training stability.
+     """
+ 
+     @staticmethod
+     def forward(ctx, x, min_val, max_val):
+         return torch.clamp(x, min_val, max_val)
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         return grad_output, None, None
+ 
+ 
+ def round_ste(x: torch.Tensor) -> torch.Tensor:
+     """Round with STE for gradient flow (C-style half-up)."""
+     return RoundHalfUpSTE.apply(x)  # floor(x + 0.5) - matches C
+ 
+ 
+ def round_half_up_ste(x: torch.Tensor) -> torch.Tensor:
+     """Half-up rounding with STE (C style).
+ 
+     This matches C integer rounding behavior.
+     Example: 2.5 -> 3, -2.5 -> -2
+     """
+     return RoundHalfUpSTE.apply(x)
+ 
+ 
+ def floor_ste(x: torch.Tensor) -> torch.Tensor:
+     """Floor with STE for gradient flow."""
+     return FloorSTE.apply(x)
+ 
+ 
+ def clamp_ste(x: torch.Tensor, min_val: float, max_val: float) -> torch.Tensor:
+     """Clamp with STE for gradient flow."""
+     return ClampSTE.apply(x, min_val, max_val)
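What makes these wrappers useful for QAT is the backward pass: the forward output is integer-valued, but the gradient is the identity, so training proceeds through the quantization step. A minimal illustration with made-up values:

    x = torch.tensor([0.4, 1.6, -2.3], requires_grad=True)
    y = round_ste(x)                   # tensor([ 0.,  2., -2.])
    y.sum().backward()
    x.grad                             # tensor([1., 1., 1.]) -- gradient passes straight through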
+ 
+ 
+ def quantize_to_int8_ste(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+     """Quantize tensor to int8 range with STE.
+ 
+     Forward:
+         x_int = round(x * scale)
+         x_int = clamp(x_int, -128, 127)
+ 
+     Backward: gradient passes through unchanged
+ 
+     Args:
+         x: Input tensor (float)
+         scale: Quantization scale (127.0 / max_activation)
+ 
+     Returns:
+         Tensor with int8 values (but float dtype for gradient)
+     """
+     x_scaled = x * scale
+     x_rounded = round_ste(x_scaled)
+     x_clamped = clamp_ste(x_rounded, -128.0, 127.0)
+     return x_clamped
+ 
+ 
+ def quantize_to_uint8_ste(x: torch.Tensor, scale: float = 256.0) -> torch.Tensor:
+     """Quantize tensor to uint8 range with STE.
+ 
+     For the first layer: input [0, 1] -> [0, 255]
+ 
+     Args:
+         x: Input tensor (float, assumed [0, 1] normalized)
+         scale: Quantization scale (default 256 for /256 normalization)
+ 
+     Returns:
+         Tensor with uint8 values (but float dtype for gradient)
+     """
+     x_scaled = x * scale
+     x_rounded = round_ste(x_scaled)
+     x_clamped = clamp_ste(x_rounded, 0.0, 255.0)
+     return x_clamped
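Typical usage, assuming a hypothetical activation bound of 4.0 for the int8 path and a [0, 1] normalized image for the uint8 path:

    act = torch.tensor([0.5, -3.9, 7.0])
    quantize_to_int8_ste(act, torch.tensor(127.0 / 4.0))   # tensor([  16., -124.,  127.])
    img = torch.tensor([0.0, 0.5, 1.0])
    quantize_to_uint8_ste(img)                             # tensor([  0., 128., 255.])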
+ 
+ 
+ def requantize_ste(acc: torch.Tensor, scale_int: int, shift: int) -> torch.Tensor:
+     """Simulate C requantization with STE.
+ 
+     C code:
+         out = ((int64_t)acc * scale_int + (1 << (shift-1))) >> shift
+ 
+     This is equivalent to:
+         out = floor((acc * scale_int + round_const) / divisor)
+ 
+     where round_const = 1 << (shift-1) and divisor = 1 << shift.
+ 
+     Args:
+         acc: Accumulator tensor (int32-range values in a float tensor)
+         scale_int: Integer scale factor
+         shift: Right-shift amount
+ 
+     Returns:
+         Requantized tensor (int32-range values in a float tensor)
+     """
+     if shift > 0:
+         round_const = 1 << (shift - 1)
+     else:
+         round_const = 0
+ 
+     divisor = float(1 << shift)
+ 
+     # Note: acc * scale_int must stay below 2^24 for float32 to represent it
+     # exactly; cast to double first if larger accumulators are expected.
+     numerator = acc * float(scale_int) + float(round_const)
+     result = floor_ste(numerator / divisor)
+ 
+     return result
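The forward result can be cross-checked against plain Python integer arithmetic, which evaluates the C expression exactly (hypothetical values, kept small enough that float32 stays exact):

    acc, scale_int, shift = 12345, 300, 12
    (acc * scale_int + (1 << (shift - 1))) >> shift                   # 904
    requantize_ste(torch.tensor([float(acc)]), scale_int, shift)      # tensor([904.])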
+ 
+ 
+ def compute_scale_params(combined_scale: float, target_range: tuple = (64, 512)) -> tuple:
+     """Compute an integer scale and shift from a float scale.
+ 
+     Find (scale_int, shift) such that:
+         scale_int / (1 << shift) ≈ combined_scale
+         target_range[0] <= scale_int <= target_range[1]
+ 
+     Args:
+         combined_scale: Float scale value (alpha * act_scale / prev_act_scale)
+         target_range: Target range for scale_int (default 64-512 to match export.py)
+ 
+     Returns:
+         (scale_int, shift) tuple
+     """
+     if combined_scale == 0:
+         return 0, 0
+ 
+     min_scale, max_scale = target_range
+     shift = 0
+     scale_magnitude = abs(combined_scale)
+ 
+     while scale_magnitude < min_scale and shift < 24:
+         scale_magnitude *= 2
+         shift += 1
+ 
+     while scale_magnitude > max_scale and shift > 0:
+         scale_magnitude /= 2
+         shift -= 1
+ 
+     scale_int = round(combined_scale * (1 << shift))
+ 
+     return scale_int, shift
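A worked example: for a combined scale of 0.1, the loop doubles the magnitude ten times to reach the 64-512 window, giving shift = 10 and scale_int = round(0.1 * 1024) = 102, which reproduces the float scale to within about 0.4%:

    scale_int, shift = compute_scale_params(0.1)    # (102, 10)
    scale_int / (1 << shift)                        # 0.099609375, close to 0.1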