tiny-turboquant 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. tiny_turboquant-0.1.0/LICENSE +21 -0
  2. tiny_turboquant-0.1.0/MANIFEST.in +6 -0
  3. tiny_turboquant-0.1.0/PKG-INFO +109 -0
  4. tiny_turboquant-0.1.0/README.md +64 -0
  5. tiny_turboquant-0.1.0/benchmarks/__init__.py +0 -0
  6. tiny_turboquant-0.1.0/benchmarks/bench_ann.py +188 -0
  7. tiny_turboquant-0.1.0/benchmarks/bench_kv_real.py +192 -0
  8. tiny_turboquant-0.1.0/demos/__init__.py +0 -0
  9. tiny_turboquant-0.1.0/demos/demo1_distortion_vs_theory.py +75 -0
  10. tiny_turboquant-0.1.0/demos/demo2_ann_vs_pq.py +133 -0
  11. tiny_turboquant-0.1.0/demos/demo3_real_embeddings.py +151 -0
  12. tiny_turboquant-0.1.0/demos/demo4_kv_cache.py +308 -0
  13. tiny_turboquant-0.1.0/pyproject.toml +81 -0
  14. tiny_turboquant-0.1.0/setup.cfg +4 -0
  15. tiny_turboquant-0.1.0/setup.py +3 -0
  16. tiny_turboquant-0.1.0/tests/test_tiny_turboquant.py +114 -0
  17. tiny_turboquant-0.1.0/tiny_turboquant/__init__.py +28 -0
  18. tiny_turboquant-0.1.0/tiny_turboquant/bitpack.py +98 -0
  19. tiny_turboquant-0.1.0/tiny_turboquant/codebooks.py +89 -0
  20. tiny_turboquant-0.1.0/tiny_turboquant/fwht.py +59 -0
  21. tiny_turboquant-0.1.0/tiny_turboquant/kv_cache.py +337 -0
  22. tiny_turboquant-0.1.0/tiny_turboquant/numpy_reference.py +271 -0
  23. tiny_turboquant-0.1.0/tiny_turboquant/outlier_split.py +95 -0
  24. tiny_turboquant-0.1.0/tiny_turboquant/quantizer.py +119 -0
  25. tiny_turboquant-0.1.0/tiny_turboquant/rotation.py +80 -0
  26. tiny_turboquant-0.1.0/tiny_turboquant.egg-info/PKG-INFO +109 -0
  27. tiny_turboquant-0.1.0/tiny_turboquant.egg-info/SOURCES.txt +28 -0
  28. tiny_turboquant-0.1.0/tiny_turboquant.egg-info/dependency_links.txt +1 -0
  29. tiny_turboquant-0.1.0/tiny_turboquant.egg-info/requires.txt +24 -0
  30. tiny_turboquant-0.1.0/tiny_turboquant.egg-info/top_level.txt +3 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pradeep Boopathy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include tiny_turboquant *.py
4
+ recursive-include tests *.py
5
+ recursive-include demos *.py
6
+ recursive-include benchmarks *.py
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.4
2
+ Name: tiny-turboquant
3
+ Version: 0.1.0
4
+ Summary: Low-bit vector and KV-cache compression research toolkit for PyTorch
5
+ Author: Pradeep Boopathy
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/pradeepboopathy/tiny-turboquant
8
+ Project-URL: Repository, https://github.com/pradeepboopathy/tiny-turboquant
9
+ Project-URL: Issues, https://github.com/pradeepboopathy/tiny-turboquant/issues
10
+ Keywords: quantization,kv-cache,llm,compression,vector-search,pytorch,rag,transformers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: torch>=2.1
25
+ Requires-Dist: numpy>=1.24
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8; extra == "dev"
28
+ Requires-Dist: build>=1.2; extra == "dev"
29
+ Requires-Dist: twine>=5.0; extra == "dev"
30
+ Requires-Dist: ruff>=0.5; extra == "dev"
31
+ Provides-Extra: demos
32
+ Requires-Dist: matplotlib>=3.7; extra == "demos"
33
+ Requires-Dist: faiss-cpu>=1.7.4; extra == "demos"
34
+ Requires-Dist: sentence-transformers>=2.6; extra == "demos"
35
+ Provides-Extra: llm
36
+ Requires-Dist: transformers>=4.40; extra == "llm"
37
+ Requires-Dist: accelerate>=0.30; extra == "llm"
38
+ Provides-Extra: all
39
+ Requires-Dist: matplotlib>=3.7; extra == "all"
40
+ Requires-Dist: faiss-cpu>=1.7.4; extra == "all"
41
+ Requires-Dist: sentence-transformers>=2.6; extra == "all"
42
+ Requires-Dist: transformers>=4.40; extra == "all"
43
+ Requires-Dist: accelerate>=0.30; extra == "all"
44
+ Dynamic: license-file
45
+
46
+ # Tiny TurboQuant
47
+
48
+ Tiny TurboQuant is a lightweight PyTorch research toolkit for low-bit vector compression and KV-cache compression experiments.
49
+
50
+ It includes:
51
+
52
+ - real `uint8` bit-packing for low-bit indices
53
+ - MSE-style scalar quantization
54
+ - product / inner-product-oriented quantization experiments
55
+ - outlier-split quantization
56
+ - a Hugging Face-compatible KV-cache prototype
57
+ - demos for ANN search, embedding compression, and KV-cache memory measurement
58
+
59
+ ## Important limitation
60
+
61
+ This package demonstrates packed memory compression. It is **not** a production compressed-attention engine. The KV-cache wrapper still dequantizes tensors before attention. Real latency gains require fused CUDA/Triton kernels or integration with an inference engine such as vLLM or TensorRT-LLM.
62
+
63
+ ## Install from local wheel
64
+
65
+ ```bash
66
+ pip install tiny_turboquant-0.1.0-py3-none-any.whl
67
+ ```
68
+
69
+ ## Basic usage
70
+
71
+ ```python
72
+ import torch
73
+ from tiny_turboquant import TurboQuantMSE, TurboQuantKVCache
74
+
75
+ x = torch.randn(128, 64)
76
+ x = x / x.norm(dim=-1, keepdim=True)
77
+
78
+ q = TurboQuantMSE.build(d=64, bits=4)
79
+ idx = q.quant(x)
80
+ x_hat = q.dequant(idx)
81
+
82
+ cache = TurboQuantKVCache(bits=4)
83
+ ```
84
+
85
+ ## Run tests
86
+
87
+ ```bash
88
+ python -m pytest
89
+ ```
90
+
91
+ ## Run demos from source checkout
92
+
93
+ ```bash
94
+ python -m demos.demo1_distortion_vs_theory
95
+ python -m demos.demo2_ann_vs_pq
96
+ python -m demos.demo3_real_embeddings
97
+ python -m demos.demo4_kv_cache
98
+ ```
99
+
100
+ ## Scope
101
+
102
+ Use this package for:
103
+
104
+ - compressed vector-search experiments
105
+ - RAG embedding compression experiments
106
+ - KV-cache memory/quality tradeoff experiments
107
+ - educational or research benchmarking
108
+
109
+ Do not claim it provides production-speed LLM inference. It reduces packed storage; speed requires optimized compressed-attention kernels.
@@ -0,0 +1,64 @@
1
+ # Tiny TurboQuant
2
+
3
+ Tiny TurboQuant is a lightweight PyTorch research toolkit for low-bit vector compression and KV-cache compression experiments.
4
+
5
+ It includes:
6
+
7
+ - real `uint8` bit-packing for low-bit indices
8
+ - MSE-style scalar quantization
9
+ - product / inner-product-oriented quantization experiments
10
+ - outlier-split quantization
11
+ - a Hugging Face-compatible KV-cache prototype
12
+ - demos for ANN search, embedding compression, and KV-cache memory measurement
13
+
14
+ ## Important limitation
15
+
16
+ This package demonstrates packed memory compression. It is **not** a production compressed-attention engine. The KV-cache wrapper still dequantizes tensors before attention. Real latency gains require fused CUDA/Triton kernels or integration with an inference engine such as vLLM or TensorRT-LLM.
17
+
18
+ ## Install from local wheel
19
+
20
+ ```bash
21
+ pip install tiny_turboquant-0.1.0-py3-none-any.whl
22
+ ```
23
+
24
+ ## Basic usage
25
+
26
+ ```python
27
+ import torch
28
+ from tiny_turboquant import TurboQuantMSE, TurboQuantKVCache
29
+
30
+ x = torch.randn(128, 64)
31
+ x = x / x.norm(dim=-1, keepdim=True)
32
+
33
+ q = TurboQuantMSE.build(d=64, bits=4)
34
+ idx = q.quant(x)
35
+ x_hat = q.dequant(idx)
36
+
37
+ cache = TurboQuantKVCache(bits=4)
38
+ ```
39
+
40
+ ## Run tests
41
+
42
+ ```bash
43
+ python -m pytest
44
+ ```
45
+
46
+ ## Run demos from source checkout
47
+
48
+ ```bash
49
+ python -m demos.demo1_distortion_vs_theory
50
+ python -m demos.demo2_ann_vs_pq
51
+ python -m demos.demo3_real_embeddings
52
+ python -m demos.demo4_kv_cache
53
+ ```
54
+
55
+ ## Scope
56
+
57
+ Use this package for:
58
+
59
+ - compressed vector-search experiments
60
+ - RAG embedding compression experiments
61
+ - KV-cache memory/quality tradeoff experiments
62
+ - educational or research benchmarking
63
+
64
+ Do not claim it provides production-speed LLM inference. It reduces packed storage; speed requires optimized compressed-attention kernels.
File without changes
@@ -0,0 +1,188 @@
1
+ """
2
+ ANN A/B benchmark: TurboQuant vs faiss PQ vs faiss IVFPQ vs RaBitQ (optional).
3
+
4
+ Reports recall@k, indexing time, and queries/second for each method on the
5
+ same dataset. The dataset can be loaded from a .npy file (most realistic)
6
+ or generated synthetically.
7
+
8
+ Usage:
9
+ python -m benchmarks.bench_ann --data path/to/embeddings.npy --bits 4 --k 10
10
+ python -m benchmarks.bench_ann --synthetic 50000 --d 768 --bits 4
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import time
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import torch
21
+
22
+ from tiny_turboquant import TurboQuantProd
23
+
24
+
25
def parse_args():
    """Build and parse the CLI options for the ANN A/B benchmark."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data", type=str, default=None,
        help="Path to .npy of shape (N, D), float32.",
    )
    # Integer knobs: synthetic-dataset size/shape and search parameters.
    for flag, default in (
        ("--synthetic", 50_000),
        ("--d", 768),
        ("--bits", 4),
        ("--k", 10),
        ("--n_queries", 500),
    ):
        parser.add_argument(flag, type=int, default=default)
    # Prefer the GPU when one is visible; callers can still override.
    default_device = "cuda" if torch.cuda.is_available() else "cpu"
    parser.add_argument("--device", default=default_device)
    return parser.parse_args()
36
+
37
+
38
def load_data(args) -> tuple[np.ndarray, np.ndarray]:
    """Return ``(database, queries)`` as L2-normalised float32 arrays.

    Loads ``args.data`` when a path is given; otherwise synthesises a
    clustered dataset that mimics a real embedding store. Queries are
    drawn (without replacement) from the database rows themselves.
    """
    if args.data is None:
        gen = np.random.default_rng(0)
        # Cluster structure mimicking real embedding stores.
        k_clusters = max(50, args.synthetic // 500)
        prototypes = gen.standard_normal((k_clusters, args.d)).astype(np.float32)
        prototypes /= np.linalg.norm(prototypes, axis=1, keepdims=True)
        assignment = gen.integers(0, k_clusters, size=args.synthetic)
        noise = 0.25 * gen.standard_normal((args.synthetic, args.d)).astype(np.float32)
        X = prototypes[assignment] + noise
    else:
        X = np.load(args.data).astype(np.float32)

    # Unit-normalise rows; the epsilon guards against an all-zero row.
    X = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)
    query_gen = np.random.default_rng(1)
    picks = query_gen.choice(X.shape[0], args.n_queries, replace=False)
    return X, X[picks].copy()
54
+
55
+
56
def recall_at_k(true_top: np.ndarray, est_top: np.ndarray) -> float:
    """Fraction of true top-k neighbours recovered, averaged over queries.

    Both arrays are (n_queries, k) index matrices; row order within a
    query does not matter — only set overlap counts.
    """
    total_hits = 0
    for truth_row, est_row in zip(true_top, est_top):
        total_hits += len(set(truth_row.tolist()) & set(est_row.tolist()))
    return total_hits / true_top.size
60
+
61
+
62
+ # ---- methods ----------------------------------------------------------
63
+
64
def turboquant_run(X, Q, k, bits, device):
    """Index and query with TurboQuantProd.

    Returns (top-k index matrix, indexing seconds, query seconds).
    The query phase dequantizes the whole base set and does a dense
    inner-product top-k — it measures quality, not an optimized ANN path.
    """
    base = torch.from_numpy(X).to(device)
    queries = torch.from_numpy(Q).to(device)
    dim = X.shape[1]
    on_cuda = device.startswith("cuda")

    started = time.perf_counter()
    quantizer = TurboQuantProd.build(dim, bits, device=device, seed=0,
                                     dtype=torch.float32)
    codes, signs, gamma = quantizer.quant(base)
    if on_cuda:
        torch.cuda.synchronize()  # make the wall-clock honest on CUDA
    t_index = time.perf_counter() - started

    started = time.perf_counter()
    recon = quantizer.dequant(codes, signs, gamma)
    scores = queries @ recon.T
    top = scores.topk(k, dim=-1).indices.cpu().numpy()
    if on_cuda:
        torch.cuda.synchronize()
    t_query = time.perf_counter() - started
    return top, t_index, t_query
85
+
86
+
87
def faiss_pq_run(X, Q, k, bits):
    """faiss IndexPQ baseline at a matched per-coordinate bit budget.

    Returns (top-k matrix, index seconds, query seconds), or a triple of
    ``None`` when faiss is not importable in this environment.
    """
    try:
        import faiss
    except Exception as e:  # pragma: no cover
        print(f" (faiss unavailable: {e}) — skipping PQ")
        return None, None, None

    dim = X.shape[1]
    # Match bit-budget: PQ stores log2(ks) bits per sub-quantizer;
    # M sub-vectors * log2(ks) / d total bits per coord.
    # Use ks=2**bits and M=d so PQ's bit-budget = bits per coord.
    n_sub = dim
    codebook = min(2 ** bits, 256)  # faiss IndexPQ stores 1 byte per code
    index = faiss.IndexPQ(dim, n_sub, int(np.log2(codebook)))
    index.metric_type = faiss.METRIC_INNER_PRODUCT

    t0 = time.perf_counter()
    index.train(X)
    index.add(X)
    t_index = time.perf_counter() - t0

    t0 = time.perf_counter()
    _, top = index.search(Q, k)
    t_query = time.perf_counter() - t0
    return top, t_index, t_query
108
+
109
+
110
def faiss_ivfpq_run(X, Q, k, bits):
    """faiss IVF-PQ baseline (coarse inner-product quantizer + PQ codes).

    Returns (top-k matrix, index seconds, query seconds), or all-``None``
    when faiss is not importable.
    """
    try:
        import faiss
    except Exception:
        return None, None, None

    dim = X.shape[1]
    n_lists = 100
    n_sub = dim
    code_bits = min(bits, 8)  # faiss sub-quantizer codes are at most 8 bits
    coarse = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFPQ(coarse, dim, n_lists, n_sub, code_bits)
    index.metric_type = faiss.METRIC_INNER_PRODUCT

    t0 = time.perf_counter()
    index.train(X)
    index.add(X)
    t_index = time.perf_counter() - t0

    index.nprobe = 16
    t0 = time.perf_counter()
    _, top = index.search(Q, k)
    t_query = time.perf_counter() - t0
    return top, t_index, t_query
128
+
129
+
130
def rabitq_run(X, Q, k, bits):  # pragma: no cover
    """Optional RaBitQ baseline; skipped silently if package missing."""
    skipped = (None, None, None)
    try:
        import rabitqlib  # noqa: F401
    except Exception as e:
        print(f" (rabitq unavailable: {e}) — skipping RaBitQ")
        return skipped
    # Import succeeded but integration is intentionally unfinished.
    print(" TODO: integrate rabitqlib (left as exercise — official API churns)")
    return skipped
139
+
140
+
141
+ # ---- driver ----------------------------------------------------------
142
+
143
def main():
    """Run every available method on one dataset and print a comparison table."""
    args = parse_args()
    X, Q = load_data(args)
    n, d = X.shape

    # Ground truth: brute-force top-k inner product
    print(f"\nDataset: {n} vectors of dim {d}, {len(Q)} queries, k={args.k}, "
          f"bits={args.bits}, device={args.device}\n")
    t0 = time.perf_counter()
    sims = Q @ X.T
    # argpartition yields an unordered candidate set; sort it per query
    # so recall comparisons use the exact top-k ordering.
    candidates = np.argpartition(-sims, args.k, axis=1)[:, :args.k]
    ordered = []
    for qi, cand in enumerate(candidates):
        ordered.append(cand[np.argsort(-sims[qi, cand])])
    true_top = np.array(ordered)
    t_brute = time.perf_counter() - t0
    print(f"brute force baseline: {t_brute:.3f}s\n")

    methods = [
        ("TurboQuant", lambda: turboquant_run(X, Q, args.k, args.bits, args.device)),
        ("faiss PQ", lambda: faiss_pq_run(X, Q, args.k, args.bits)),
        ("faiss IVF-PQ", lambda: faiss_ivfpq_run(X, Q, args.k, args.bits)),
        ("RaBitQ (opt.)", lambda: rabitq_run(X, Q, args.k, args.bits)),
    ]
    rows = []
    for name, runner in methods:
        top, t_index, t_query = runner()
        if top is None:
            continue  # method unavailable in this environment
        rec = recall_at_k(true_top, top)
        qps = len(Q) / max(t_query, 1e-9)
        rows.append((name, t_index, t_query, qps, rec))

    print(f"{'method':16s} | {'index(s)':>9} | {'query(s)':>9} | {'qps':>10} | {'recall@k':>9}")
    print("-" * 72)
    for name, ti, tq, qps, rec in rows:
        print(f"{name:16s} | {ti:>9.3f} | {tq:>9.3f} | {qps:>10.0f} | {rec:>9.3f}")
    print()

    # Head-to-head summary, only when both rows are present.
    if rows:
        tq_row = next((r for r in rows if r[0] == "TurboQuant"), None)
        pq_row = next((r for r in rows if r[0] == "faiss PQ"), None)
        if tq_row and pq_row:
            print(f"TurboQuant indexing speedup vs PQ: {pq_row[1] / tq_row[1]:.1f}×")
            print(f"TurboQuant recall delta vs PQ : {tq_row[4] - pq_row[4]:+.3f}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,192 @@
1
+ """
2
+ Real-LLM benchmark: TurboQuant KV cache vs fp16 baseline.
3
+
4
+ Runs a short prompt-completion task on a small open model and reports:
5
+ - actual packed KV cache memory
6
+ - per-token attention output cosine similarity
7
+ - generation perplexity / acceptance vs baseline
8
+
9
+ Default model: HuggingFaceTB/SmolLM2-360M-Instruct (small enough for
10
+ free Kaggle T4, big enough to be representative).
11
+
12
+ Usage:
13
+ python -m benchmarks.bench_kv_real --model SmolLM2-360M --bits 4
14
+ python -m benchmarks.bench_kv_real --bits 2 --bits_outlier 3 --n_outlier 32
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import math
21
+ import os
22
+ import time
23
+
24
+ # Quieten the noisy weight-loading tqdm bar from transformers / accelerate.
25
+ # (When stdout is captured by a subprocess pipe, tqdm prints one line per
26
+ # update instead of redrawing in place, which floods the log.)
27
+ os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
28
+ os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
29
+ os.environ.setdefault("ACCELERATE_DISABLE_RICH", "1")
30
+ os.environ.setdefault("TQDM_DISABLE", "1")
31
+
32
+ import torch
33
+
34
+ from tiny_turboquant import TurboQuantKVCache
35
+
36
+
37
def parse_args():
    """Build and parse the CLI options for the real-LLM KV-cache benchmark."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="HuggingFaceTB/SmolLM2-360M-Instruct")
    parser.add_argument("--bits", type=int, default=4)
    parser.add_argument(
        "--bits_outlier", type=int, default=None,
        help="If set, enable outlier-channel split with this bit-width.",
    )
    parser.add_argument("--n_outlier", type=int, default=32)
    # Few-shot style prompt so greedy decoding has an unambiguous answer.
    parser.add_argument("--prompt", default=(
        "The capital of France is Paris. The capital of Germany is Berlin. "
        "The capital of Spain is Madrid. The capital of Italy is"))
    parser.add_argument("--max_new_tokens", type=int, default=64)
    default_device = "cuda" if torch.cuda.is_available() else "cpu"
    parser.add_argument("--device", default=default_device)
    parser.add_argument("--dtype", default="float16")
    return parser.parse_args()
51
+
52
+
53
@torch.no_grad()
def measure(model, tokenizer, prompt, cache, max_new_tokens):
    """Greedy-generate from *prompt* through *cache*.

    Returns (decoded text, elapsed seconds, raw output ids). The supplied
    ``cache`` object is handed to ``model.generate`` as ``past_key_values``.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    started = time.perf_counter()
    generated = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        past_key_values=cache,
    )
    elapsed = time.perf_counter() - started
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded, elapsed, generated
66
+
67
+
68
@torch.no_grad()
def measure_logit_kl(model, tokenizer, prompt, cache_factory, max_new_tokens=32):
    """Teacher-forced KL between fp16 baseline logits and TurboQuant logits.

    Both paths see identical input at every step. This isolates pure
    quantization error. Free-running generations are still produced for
    the "identical generation?" check, but they are NOT used for KL —
    otherwise the metric explodes the moment paths diverge by one token.

    Returns ``(text_ref, text_tq, kls, cache, first_diverge, n_new)`` where
    ``kls`` is a per-position list of KL(p_ref || p_tq), ``cache`` is the
    quantized cache used by the free-running TQ generation (queried later
    for memory stats), and ``first_diverge``/``n_new`` describe how long
    the two greedy decodes stay token-identical.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # 1) Free-running baseline + TQ generations (for identical-generation check only)
    out_ref = model.generate(
        **inputs, max_new_tokens=max_new_tokens, do_sample=False,
        return_dict_in_generate=True,
    )
    cache = cache_factory()
    out_tq = model.generate(
        **inputs, max_new_tokens=max_new_tokens, do_sample=False,
        return_dict_in_generate=True,
        past_key_values=cache,
    )

    text_ref = tokenizer.decode(out_ref.sequences[0], skip_special_tokens=True)
    text_tq = tokenizer.decode(out_tq.sequences[0], skip_special_tokens=True)

    # First-divergence index: how many decode steps stay greedy-identical?
    # This is the production-relevant signal — small KL means the answer
    # token is identical even when later filler tokens drift.
    prompt_len_for_div = inputs["input_ids"].shape[1]
    ref_new = out_ref.sequences[0, prompt_len_for_div:].tolist()
    tq_new = out_tq.sequences[0, prompt_len_for_div:].tolist()
    n_new = min(len(ref_new), len(tq_new))
    first_diverge = next(
        (i for i in range(n_new) if ref_new[i] != tq_new[i]),
        n_new,
    )

    # 2) Teacher-forced KL: feed the SAME baseline-generated sequence into
    #    both fp16-cache and TQ-cache, compare per-position logits.
    seq = out_ref.sequences  # (1, prompt_len + new_tokens)
    prompt_len = inputs["input_ids"].shape[1]

    # fp16 reference: one forward pass, no cache compression.
    ref_logits = model(seq).logits  # (1, T, V)

    # TQ path: prefill + decode through TurboQuantKVCache so attention
    # is computed against quantised K/V at every step.
    tq_cache = cache_factory()
    # prefill on the prompt
    tq_logits_chunks = []
    out = model(seq[:, :prompt_len], past_key_values=tq_cache, use_cache=True)
    tq_logits_chunks.append(out.logits)
    # decode one token at a time, feeding the *baseline* tokens
    for t in range(prompt_len, seq.shape[1]):
        out = model(seq[:, t:t + 1], past_key_values=tq_cache, use_cache=True)
        tq_logits_chunks.append(out.logits)
    tq_logits = torch.cat(tq_logits_chunks, dim=1)  # (1, T, V)

    # Compare logits at positions [prompt_len-1 .. T-2] — the ones that
    # predict the new tokens. Both saw identical inputs.
    p = torch.softmax(ref_logits[:, prompt_len - 1:-1].float(), dim=-1)
    q = torch.softmax(tq_logits[:, prompt_len - 1:-1].float(), dim=-1)
    # BUG FIX: the original used in-place clamp_min_() here, which mutated
    # `p` BEFORE the multiply in `p * (...)`, so the KL weights were the
    # clamped probabilities rather than the true softmax. The epsilon is
    # only needed inside the logs (to avoid log(0)), so clamp out of place.
    kls = (
        (p * (p.clamp_min(1e-12).log() - q.clamp_min(1e-12).log()))
        .sum(-1)
        .squeeze(0)
        .tolist()
    )

    return text_ref, text_tq, kls, cache, first_diverge, n_new
134
+
135
+
136
def main() -> None:
    """Load the model, run the KL benchmark, and print memory/quality metrics."""
    args = parse_args()
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.utils import logging as hf_logging
    import huggingface_hub.utils as hf_hub_utils

    # Hard-disable every progress bar transformers / huggingface_hub know
    # about. Env vars alone are unreliable when this script is launched as
    # a subprocess from kaggle_run.py (pipes confuse tqdm).
    hf_logging.disable_progress_bar()
    hf_logging.set_verbosity_error()
    hf_hub_utils.disable_progress_bars()

    dtype_by_name = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }
    dtype = dtype_by_name[args.dtype]
    print(f"Loading {args.model} ({args.dtype} on {args.device})")
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForCausalLM.from_pretrained(
        args.model, torch_dtype=dtype
    ).to(args.device).eval()

    def make_cache():
        # Fresh quantized cache per run — caches are stateful.
        return TurboQuantKVCache(
            bits=args.bits,
            bits_outlier=args.bits_outlier,
            n_outlier=args.n_outlier,
        )

    text_ref, text_tq, kls, cache, first_diverge, n_new = measure_logit_kl(
        model, tokenizer, args.prompt, make_cache,
        max_new_tokens=args.max_new_tokens,
    )

    print("\n--- BASELINE (fp16, default DynamicCache) ---")
    print(text_ref)
    print("\n--- TURBOQUANT ---")
    print(text_tq)

    n_tokens = cache.get_seq_length(0)
    fp16_bytes = cache.fp16_baseline_bytes()
    actual_tq_bytes = cache.actual_memory_bytes()
    theoretical_tq_bytes = cache.theoretical_memory_bytes()

    print("\n--- METRICS ---")
    print(f"prompt + decode tokens : {n_tokens}")
    print(f"fp16 KV cache actual : {fp16_bytes / 1e6:.2f} MB")
    print(f"TurboQuant actual : {actual_tq_bytes / 1e6:.2f} MB "
          f"(× compression = {fp16_bytes / max(actual_tq_bytes, 1):.2f})")
    print(f"TurboQuant theoretical : {theoretical_tq_bytes / 1e6:.2f} MB")
    print(f"mean per-token logit KL: {sum(kls) / len(kls):.5f} (teacher-forced; lower is better)")
    print(f"max per-token logit KL: {max(kls):.5f}")
    print(f"first divergence at : {first_diverge}/{n_new} decode tokens "
          f"(higher is better; {n_new}/{n_new} = exact match)")
    print(f"identical generation? : {text_ref == text_tq}")


if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,75 @@
1
+ """
2
+ Demo 1 — Empirical distortion matches Shannon-theoretic bounds.
3
+
4
+ Story for the audience:
5
+ "We compress unit vectors to b bits per coordinate. The black dashed line
6
+ is the information-theoretic *lower* bound from Shannon source coding;
7
+ the orange line is what *any* possible algorithm could ever achieve
8
+ in the limit. TurboQuant (blue) sits within a small constant of optimum
9
+ for every bit-width, with zero tuning."
10
+
11
+ Run:
12
+ python -m demos.demo1_distortion_vs_theory
13
+ Outputs:
14
+ demos/out/distortion.png
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import os
20
+ import numpy as np
21
+ import matplotlib.pyplot as plt
22
+ from tiny_turboquant.numpy_reference import TurboQuantMSE, TurboQuantProd
23
+
24
# Demo artifacts land next to this file, under demos/out/; create the
# directory eagerly so savefig never fails later.
_PKG_DIR = os.path.dirname(__file__)
OUT = os.path.join(_PKG_DIR, "out")
os.makedirs(OUT, exist_ok=True)
26
+
27
+
28
def main() -> None:
    """Plot empirical MSE against the theoretical bounds, then print an
    inner-product bias/variance table for the product quantizer."""
    rng = np.random.default_rng(0)
    d, n = 512, 4000
    data = rng.standard_normal((n, d))
    data /= np.linalg.norm(data, axis=1, keepdims=True)

    bit_widths = [1, 2, 3, 4, 5, 6]
    empirical, lower, upper = [], [], []
    for b in bit_widths:
        quantizer = TurboQuantMSE(d, b, seed=0)
        recon = quantizer.dequant(quantizer.quant(data))
        empirical.append(float(np.mean(np.sum((data - recon) ** 2, axis=1))))
        lower.append(4.0 ** (-b))                 # Shannon lower bound
        upper.append(3 * np.pi / 2 * 4.0 ** (-b))  # paper Theorem 1

    fig, ax = plt.subplots(figsize=(7, 4.5))
    ax.plot(bit_widths, upper, "--", color="orange", label="Paper upper bound (3π/2·4⁻ᵇ)")
    ax.plot(bit_widths, lower, "--", color="black", label="Shannon lower bound (4⁻ᵇ)")
    ax.plot(bit_widths, empirical, "o-", color="tab:blue", lw=2, label="TurboQuant empirical")
    ax.set_yscale("log")
    ax.set_xlabel("bits per coordinate")
    ax.set_ylabel("MSE E‖x − x̂‖²")
    ax.set_title(f"TurboQuant-MSE vs information-theoretic bounds (d={d}, n={n})")
    ax.grid(True, which="both", alpha=0.3)
    ax.legend()
    fig.tight_layout()
    out = os.path.join(OUT, "distortion.png")
    fig.savefig(out, dpi=140)
    print(f"saved {out}")

    # Also show inner-product unbiasedness — the *killer* feature for ANN/RAG.
    probes = rng.standard_normal((n, d))
    probes /= np.linalg.norm(probes, axis=1, keepdims=True)
    print("\n bits | bias | ip-MSE | ratio vs full-precision")
    print(" -----+----------+-----------+--------------------------")
    for b in (2, 3, 4):
        prod_q = TurboQuantProd(d, b, seed=0)
        codes, signs, gamma = prod_q.quant(data)
        recon = prod_q.dequant(codes, signs, gamma)
        exact_ip = np.sum(data * probes, 1)
        approx_ip = np.sum(recon * probes, 1)
        bias = float(np.mean(approx_ip - exact_ip))
        var = float(np.mean((approx_ip - exact_ip) ** 2))
        print(f" {b} | {bias:+.4f} | {var:.5f} | "
              f"{32/b:.0f}× smaller than fp32")


if __name__ == "__main__":
    main()