PyPI - sam-gate - Versions diffs - 0.1.0__py3-none-any.whl - Mend

sam-gate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

sam_gate/__init__.py +10 -0
sam_gate/config.py +206 -0
sam_gate/kv_cache.py +2384 -0
sam_gate/sam.py +2420 -0
sam_gate/spectral.py +1685 -0
sam_gate-0.1.0.dist-info/METADATA +28 -0
sam_gate-0.1.0.dist-info/RECORD +8 -0
sam_gate-0.1.0.dist-info/WHEEL +4 -0

sam_gate/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""SAM-Gate: semantic KV compression and RCI-guided quantization."""
+from .config import SAMConfig
+from .sam import attach_semantic_hooks, verify_sam_wblk_caches
+__all__ = [  # public names re-exported at package level (imported just above)
+    "SAMConfig",
+    "attach_semantic_hooks",
+    "verify_sam_wblk_caches",
+]

sam_gate/config.py ADDED Viewed

@@ -0,0 +1,206 @@
+"""
+SAM-Gate hyperparameters and defaults.
+Centralizes values that were previously scattered across the main script, CLI, and heuristics.
+Adjust here to calibrate policy, prober, demo prompts, and KV estimates.
+Typical usage:
+    from sam_gate.config import SAMConfig, DEFAULT_CLI
+    cfg = SAMConfig()
+    # or
+    cfg = SAMConfig(tau=0.03, max_ctx_flat=256)
+─── CALIBRATION (Qwen2.5-7B-Instruct) ───────────────────────────────────────────
+Thresholds below are NOT yet calibrated for Qwen2.5-7B-Instruct.
+Run calibration before use:
+    python -m sam_gate.sam --model Qwen/Qwen2.5-7B-Instruct --calibrate --verbose
+Then adjust f_flat_max / f_obs_min in SAMConfig and SAMCliDefaults according to
+the observed f(t) distribution per layer.
+Reference values from Qwen2.5-3B-Instruct (for comparison):
+    L00-L01  : f(t) ~ 2e-05 – 2e-02   (quiet layers)
+    L02-L09  : f(t) ~ 3e-02 – 1.2e+00 (medium layers)
+    L10-L22  : f(t) ~ 4e-02 – 2.9e+01 (active layers)
+    L23      : f(t) ~ 5.2e+02 – 1.4e+03 (most active layer)
+─────────────────────────────────────────────────────────────────────────────────
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+# ── Prompts ──────────────────────────────────────────────────────────────────
+STRESS_TEST_PROMPT: str = (
+    "Compare Gödel incompleteness with Turing undecidability and explain the "
+    "implications for formal systems under different axiomatic assumptions."
+)
+DEFAULT_DEMO_PROMPT: str = (  # also the default value of SAMCliDefaults.prompt
+    "Explain how attention mechanisms work in transformers."
+)
+CALIBRATION_SIMPLE_PROMPT: str = "What is the capital of France?"
+# Base paragraph for --long_prompt / _make_long_prompt; it is repeated until the target token count is reached
+LONG_PROMPT_FILLER_BASE: str = (
+    "The relationship between computational complexity and semantic meaning in "
+    "large language models is a deeply contested area of research. Attention "
+    "mechanisms allow the model to dynamically weight different parts of the "
+    "input sequence, creating context-dependent representations at each layer. "
+    "The key-value cache stores intermediate computations to avoid redundant "
+    "forward passes during autoregressive generation. "
+)
+# Rough characters-per-token ratio used to size the long prompt text
+LONG_PROMPT_CHARS_PER_TOKEN: float = 4.5
+# Memory test: bounds of the random secret code (both bounds are 6-digit numbers)
+SECRET_CODE_MIN: int = 100_000
+SECRET_CODE_MAX: int = 999_999
+# ── Attention structure (used for inference when the model doesn't expose head_dim) ──
+HEAD_DIM_CANDIDATES: tuple[int, ...] = (32, 64, 80, 96, 128, 256)
+# ── RealDynamicsProber ───────────────────────────────────────────────────────
+# Maximum number of heads used for Gram norms per step (cost vs. stability trade-off)
+PROBE_HEADS_CAP: int = 4
+# ── Heuristic KV estimate (_estimate_max_kv_mb) ──────────────────────────────
+# Qwen2.5-7B-Instruct: 28 layers, 8 KV heads, head_dim=128.
+# Does not replace actual model measurement — used only for pre-run estimates.
+@dataclass
+class SAMKVHeuristicEstimate:  # inputs to the heuristic, pre-run KV size estimate
+    kv_heads: int = 8  # KV head count (8 for Qwen2.5-7B-Instruct per the note above)
+    head_dim: int = 128  # per-head dimension (128 for Qwen2.5-7B-Instruct per the note above)
+    n_layers_flat: int = 2  # presumably layers assumed to be in the flat regime — TODO confirm
+    n_layers_trans: int = 22  # presumably layers assumed to be in transition — TODO confirm
+    n_layers_obs: int = 4  # presumably layers assumed obstructed; 2 + 22 + 4 = 28 layers in total
+DEFAULT_KV_HEURISTIC = SAMKVHeuristicEstimate()  # module-level instance carrying the defaults
+# ── Generation (benchmark / main) ────────────────────────────────────────────
+@dataclass
+class SAMGenerationDefaults:  # generation defaults for benchmark / main
+    max_new_tokens: int = 100
+    max_new_tokens_calibrate: int = 30  # presumably the token budget for --calibrate runs — TODO confirm
+    use_cache: bool = False  # NOTE(review): off by default; confirm this is intended alongside the KV hooks
+    do_sample: bool = False
+DEFAULT_GENERATION = SAMGenerationDefaults()  # module-level instance carrying the defaults
+# ── CLI and interactive wizard (argparse defaults / questions) ────────────────
+# NOTE: policy parameters here must stay aligned with the SAMConfig defaults below
+# (f_flat_max and f_obs_min currently differ from SAMConfig — reconcile after calibration).
+@dataclass
+class SAMCliDefaults:  # argparse / interactive-wizard defaults
+    model: str = "Qwen/Qwen2.5-7B-Instruct"
+    device: str = "cpu"
+    prompt: str = DEFAULT_DEMO_PROMPT
+    max_new_tokens: int = 100
+    tau: float = 0.05
+    flat_bits: int = 4
+    trans_bits: int = 8
+    obs_bits: int = 16
+    flat_heads: float = 0.5        # aligned with SAMConfig
+    trans_heads: float = 0.75      # aligned with SAMConfig
+    f_flat_max: float = 5.0     # ⚠ NOT calibrated for 7B — run --calibrate. NOTE(review): SAMConfig.f_flat_max is 1e-2; confirm which default is intended
+    f_obs_min: float = 500.0        # ⚠ NOT calibrated for 7B — run --calibrate. NOTE(review): SAMConfig.f_obs_min is 50.0; confirm which default is intended
+    max_ctx_flat: int = 128        # aligned with SAMConfig
+    max_ctx_trans: int = 512       # aligned with SAMConfig
+    max_ctx_obs: int = 2048        # aligned with SAMConfig
+    long_tokens: int = 2000  # presumably the target token count for --long_prompt — TODO confirm
+    # Wizard: suggested minimums in questions
+    wizard_max_new_tokens_min: int = 10
+    wizard_ctx_flat_min: int = 16
+    wizard_ctx_trans_min: int = 64
+DEFAULT_CLI = SAMCliDefaults()  # module-level instance carrying the defaults
+# ── SAMConfig — core of semantic policy ──────────────────────────────────────
+@dataclass
+class SAMConfig:
+    """
+    SAM-Gate (Semantic-Aware Memory Gate) parameters.
+    Typical calibration: --calibrate + adjust f_flat_max / f_obs_min according to
+    the observed f(t) distribution per layer.
+    ⚠ Thresholds below are NOT yet calibrated for Qwen2.5-7B-Instruct.
+    Run: python -m sam_gate.sam --model Qwen/Qwen2.5-7B-Instruct --calibrate --verbose
+    """
+    tau: float = 0.05
+    flat_bits: int = 4  # presumably KV bit width in the flat regime (4 maps to "int4" in bits_to_kv_prec_str)
+    trans_bits: int = 8  # presumably KV bit width in transition (8 maps to "int8")
+    obs_bits: int = 16  # presumably KV bit width when obstructed (16 maps to "fp16")
+    flat_heads: float = 0.5  # presumably a head fraction for the flat regime; the calibration config sets 1.0
+    trans_heads: float = 0.75  # presumably a head fraction for transition; the calibration config sets 1.0
+    f_flat_max: float = 1e-2       # ⚠ NOT calibrated for 7B — run --calibrate. NOTE(review): SAMCliDefaults uses 5.0
+    f_obs_min: float = 50.0        # ⚠ NOT calibrated for 7B — run --calibrate. NOTE(review): SAMCliDefaults uses 500.0
+    dsem_obs_thresh: float = 5.0
+    ema_alpha: float = 0.3  # presumably the EMA smoothing factor — TODO confirm against the prober
+    # If True: probe_from_output only on prefill; decode reuses EMA from end of prefill.
+    probe_prefill_only: bool = True
+    max_ctx_flat: int = 128        # window in flat regime
+    max_ctx_trans: int = 512       # window in transition
+    max_ctx_obs: int = 2048        # window in obstructed
+    # Decode with dense ring: use flash_attn.flash_attn_interface.flash_attn_with_kvcache
+    # (CUDA) when available — fused kernel vs SDPA + materializing contiguous K/V.
+    use_flash_attn_kvcache: bool = True
+# ── Per-layer state (initialization) ─────────────────────────────────────────
+@dataclass
+class LayerSemanticState:  # initial values of the per-layer state
+    d_eff_ratio: float = 1.0  # presumably relates to the d_eff measured during calibration — TODO confirm
+    f_t: float = 0.0  # presumably the per-layer f(t) value compared against f_flat_max / f_obs_min — TODO confirm
+    D_sem: float = 0.0
+    H_sem: float = 1.0
+    step: int = 0  # starts at 0
+# ── Neutral policy for --calibrate (observe f(t) without compressing) ────────
+CALIBRATION_F_SENTINEL: float = 1e9  # value assigned to both f thresholds by the calibration config below
+def neutral_sam_config_for_calibration(tau: float) -> SAMConfig:
+    """SAMConfig that disables compression and windows for measuring f(t) / d_eff."""
+    return SAMConfig(
+        tau=tau,  # the only caller-supplied value; everything else is fixed
+        flat_bits=16,  # all three regimes use 16 bits
+        trans_bits=16,
+        obs_bits=16,
+        flat_heads=1.0,  # both head fractions set to 1.0
+        trans_heads=1.0,
+        f_flat_max=CALIBRATION_F_SENTINEL,  # both thresholds share the same sentinel
+        f_obs_min=CALIBRATION_F_SENTINEL,
+        max_ctx_flat=0,  # per the docstring, windows are disabled; 0 presumably means "no window" — TODO confirm
+        max_ctx_trans=0,
+        max_ctx_obs=0,
+        use_flash_attn_kvcache=False,  # remaining fields (e.g. ema_alpha, probe_prefill_only) keep SAMConfig defaults
+    )
+def bits_to_kv_prec_str(bits: int, *, fallback: str = "int4") -> str:  # `fallback` is keyword-only
+    """Symbolic name of KV precision used in MemoryPolicy.prec_str."""
+    return {4: "int4", 8: "int8", 16: "fp16"}.get(bits, fallback)  # any other bit width returns `fallback` instead of raising