PyPI - quantcpp - Versions diffs - 0.10.1__tar.gz → 0.11.0__tar.gz - Mend

quantcpp 0.10.1.tar.gz → 0.11.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

{quantcpp-0.10.1/quantcpp.egg-info → quantcpp-0.11.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: quantcpp
-Version: 0.10.1
+Version: 0.11.0
 Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
 Author-email: quantumaikr <noreply@quantumaikr.com>
 License: Apache-2.0

{quantcpp-0.10.1 → quantcpp-0.11.0}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "quantcpp"
-version = "0.10.1"
+version = "0.11.0"
 description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
 readme = "README.md"
 license = { text = "Apache-2.0" }
@@ -43,6 +43,9 @@ Source        = "https://github.com/quantumaikr/quant.cpp"
 Issues        = "https://github.com/quantumaikr/quant.cpp/issues"
 Changelog     = "https://github.com/quantumaikr/quant.cpp/blob/main/CHANGELOG.md"
+[project.scripts]
+quantcpp = "quantcpp.cli:main"
 [project.optional-dependencies]
 dev = ["pytest>=7.0", "build", "twine"]

{quantcpp-0.10.1 → quantcpp-0.11.0}/quant.h RENAMED Viewed

@@ -8033,32 +8033,75 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
         }
     }
-    /* Load merges if available */
+    /* Build sorted indices BEFORE merge parsing so str_lookup() can use
+     * binary search instead of O(n) linear scan.  For 248K vocab with
+     * ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
+    tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
+    if (tok->sorted_indices) {
+        for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
+        g_vocab_for_sort = tok->vocab;
+        qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
+    }
+    /* Load and parse merges if available.
+     * GGUF stores merges as a string array of "tok_a tok_b" pairs.
+     * We need to look up token IDs and build (id_a, id_b, id_merged) triples
+     * so the BPE encoder can use them. */
     int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
     if (merges_idx >= 0) {
         const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
         if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
             mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
-            /* Parse merge rules: "token_a token_b" -> find IDs, store as merge pairs */
-            uint64_t n_merges = mkv->value.array.count;
-            tok->n_merges = (int)n_merges;
-            tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
+            uint64_t n_merges_total = mkv->value.array.count;
+            tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
+            tok->n_merges = 0;
             if (tok->merge_pairs) {
-                memset(tok->merge_pairs, 0, n_merges * 3 * sizeof(int));
+                tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
+                for (uint64_t mi = 0; mi < n_merges_total; mi++) {
+                    if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
+                    /* Copy merge string and split on space: "tok_a tok_b" */
+                    char buf[2048];
+                    int slen = (int)merge_strings[mi].len;
+                    if (slen >= (int)sizeof(buf)) continue;
+                    memcpy(buf, merge_strings[mi].str, (size_t)slen);
+                    buf[slen] = '\0';
+                    char* sep = strchr(buf, ' ');
+                    if (!sep) continue;
+                    *sep = '\0';
+                    const char* str_a = buf;
+                    const char* str_b = sep + 1;
+                    /* Build merged string: concatenation of tok_a + tok_b */
+                    char merged[2048];
+                    int la = (int)strlen(str_a);
+                    int lb = (int)strlen(str_b);
+                    if (la + lb >= (int)sizeof(merged)) continue;
+                    memcpy(merged, str_a, (size_t)la);
+                    memcpy(merged + la, str_b, (size_t)lb);
+                    merged[la + lb] = '\0';
+                    /* Look up token IDs via binary search (sorted_indices built above) */
+                    int id_a = str_lookup(tok, str_a);
+                    int id_b = str_lookup(tok, str_b);
+                    int id_merged = str_lookup(tok, merged);
+                    if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
+                        tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
+                        tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
+                        tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
+                        /* Priority: earlier merges in GGUF = higher priority */
+                        tok->scores[id_merged] = (float)(n_merges_total - mi);
+                        tok->n_merges++;
+                    }
+                }
+                fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
+                        tok->n_merges, (int)n_merges_total);
             }
         }
     }
-    /* Build sorted indices for encoding (binary search by string).
-     * Use qsort for O(n log n) instead of insertion sort O(n²) — critical
-     * for 248K vocab where insertion sort would take minutes. */
-    tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
-    if (tok->sorted_indices) {
-        for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
-        g_vocab_for_sort = tok->vocab;
-        qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
-    }
     fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
             tok->vocab_size, tok->max_token_len);
     return tok;
@@ -9939,18 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {
     free(tensors);
-    /* Qwen3.5 RMSNorm adjustment: Qwen3_5RMSNorm computes
-     * output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
-     * We bake the "+1" into the weight so tq_rmsnorm can stay as
-     * out = x * rsqrt * weight.
-     *
-     * This applies to: input_layernorm, post_attention_layernorm,
-     * model.norm, q_norm, k_norm.
-     * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
-     * uses plain weight without +1).
-     *
-     * We detect Qwen3.5 by the presence of DeltaNet layers. */
-    if (model->config.delta_n_heads > 0) {
+    /* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
+     * Only for non-GGUF models (raw checkpoints). GGUF files from
+     * llama.cpp already have +1 baked in by the converter.
+     * Qwen2/Qwen3 use standard RMSNorm and never need +1. */
+    if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
         int dim_h = model->config.hidden_dim;
         int head_dim_h = model->config.head_dim;
@@ -9979,7 +10015,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
             for (int i = 0; i < dim_h; i++)
                 model->output_norm[i] += 1.0f;
         }
-        fprintf(stderr, "tq_load_model: applied Qwen3.5 RMSNorm +1 weight adjustment\n");
+        fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
     }
     /* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */
@@ -15862,22 +15898,7 @@ void quant_free_string(char* str) {
     if (str) free(str);
 }
-/* ================================================================
- * Context persistence — save/load KV cache to disk
- *
- * File format (binary, little-endian):
- *   magic:     4 bytes "QKVC"
- *   version:   uint32 (1)
- *   n_layers:  uint32
- *   kv_dim:    uint32 (n_kv_heads * head_dim)
- *   max_seq:   uint32
- *   n_tokens:  uint32 (number of filled positions)
- *   kv_type:   uint32 (TQ_TYPE_* enum or TQ_TYPE_COUNT for fp32)
- *   has_fp16v: uint32 (1 if value_cache_fp16 is used)
- *   reserved:  32 bytes (future use)
- *   data:      raw KV cache bytes
- * ================================================================ */
+/* Context persistence: QKVC format (64-byte header + raw KV data) */
 int quant_save_context(quant_ctx* ctx, const char* path) {
     if (!ctx || !ctx->state || !path) return -1;
     FILE* fp = fopen(path, "wb");
@@ -15886,29 +15907,17 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
     tq_state_t* s = ctx->state;
     tq_model_config_t* c = &ctx->model->config;
     int kv_dim = c->n_kv_heads * c->head_dim;
-    /* Header */
     fwrite("QKVC", 1, 4, fp);
-    uint32_t version = 1;
-    uint32_t nl = (uint32_t)c->n_layers;
-    uint32_t kd = (uint32_t)kv_dim;
-    uint32_t ms = (uint32_t)c->max_seq_len;
-    uint32_t nt = (uint32_t)ctx->n_ctx_tokens;
-    uint32_t kt = (uint32_t)s->kv_quant_type;
-    uint32_t hfp16 = s->value_cache_fp16 ? 1 : 0;
-    fwrite(&version, 4, 1, fp);
-    fwrite(&nl, 4, 1, fp);
-    fwrite(&kd, 4, 1, fp);
-    fwrite(&ms, 4, 1, fp);
-    fwrite(&nt, 4, 1, fp);
-    fwrite(&kt, 4, 1, fp);
-    fwrite(&hfp16, 4, 1, fp);
-    char reserved[32] = {0};
-    fwrite(reserved, 1, 32, fp);
+    uint32_t hdr[7] = { 1, (uint32_t)c->n_layers, (uint32_t)kv_dim,
+        (uint32_t)c->max_seq_len, (uint32_t)ctx->n_ctx_tokens,
+        (uint32_t)s->kv_quant_type, s->value_cache_fp16 ? 1u : 0u };
+    fwrite(hdr, 4, 7, fp);
+    char reserved[32] = {0}; fwrite(reserved, 1, 32, fp);
+    uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
     /* KV data: write only the filled portion (nt tokens) */
     for (uint32_t l = 0; l < nl; l++) {
-        size_t layer_stride = (size_t)ms * kv_dim;
+        size_t layer_stride = (size_t)c->max_seq_len * kv_dim;
         /* Key cache: FP32 or quantized */
         if (s->key_cache) {
             fwrite(s->key_cache + l * layer_stride, sizeof(float),
@@ -15916,7 +15925,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
         }
         if (s->quant_key_cache && kt < TQ_TYPE_COUNT) {
             size_t blk_sz = tq_type_type_size(kt);
-            uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)ms * blk_sz;
+            uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)c->max_seq_len * blk_sz;
             fwrite(qbase, blk_sz, nt, fp);
         }
         /* Value cache: FP32 or FP16 */
@@ -15925,7 +15934,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
                    (size_t)nt * kv_dim, fp);
         }
         if (s->value_cache_fp16) {
-            size_t layer_stride16 = (size_t)ms * kv_dim;
+            size_t layer_stride16 = (size_t)c->max_seq_len * kv_dim;
             fwrite(s->value_cache_fp16 + l * layer_stride16, sizeof(uint16_t),
                    (size_t)nt * kv_dim, fp);
         }
@@ -15942,37 +15951,16 @@ int quant_load_context(quant_ctx* ctx, const char* path) {
     FILE* fp = fopen(path, "rb");
     if (!fp) return -1;
-    /* Read and validate header */
     char magic[4];
-    if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) {
-        fclose(fp); return -1;
-    }
-    uint32_t version, nl, kd, ms, nt, kt, hfp16;
-    fread(&version, 4, 1, fp);
-    fread(&nl, 4, 1, fp);
-    fread(&kd, 4, 1, fp);
-    fread(&ms, 4, 1, fp);
-    fread(&nt, 4, 1, fp);
-    fread(&kt, 4, 1, fp);
-    fread(&hfp16, 4, 1, fp);
-    char reserved[32];
-    fread(reserved, 1, 32, fp);
+    if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) { fclose(fp); return -1; }
+    uint32_t hdr[7]; fread(hdr, 4, 7, fp);
+    char reserved[32]; fread(reserved, 1, 32, fp);
+    uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
     tq_state_t* s = ctx->state;
     tq_model_config_t* c = &ctx->model->config;
     int kv_dim = c->n_kv_heads * c->head_dim;
-    /* Validate compatibility */
-    if (nl != (uint32_t)c->n_layers || kd != (uint32_t)kv_dim) {
-        fprintf(stderr, "quant_load_context: model mismatch (layers %u vs %d, kv_dim %u vs %d)\n",
-                nl, c->n_layers, kd, kv_dim);
-        fclose(fp); return -1;
-    }
-    if (nt > (uint32_t)c->max_seq_len) {
-        fprintf(stderr, "quant_load_context: saved %u tokens > max_seq_len %d\n",
-                nt, c->max_seq_len);
-        fclose(fp); return -1;
-    }
+    if (nl != (uint32_t)c->n_layers || hdr[2] != (uint32_t)kv_dim) { fclose(fp); return -1; }
+    if (nt > (uint32_t)c->max_seq_len) { fclose(fp); return -1; }
     /* Read KV data */
     for (uint32_t l = 0; l < nl; l++) {

{quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp/__init__.py RENAMED Viewed

@@ -1,25 +1,21 @@
 """
-quantcpp -- The SQLite of LLMs. Single-header C inference in Python.
+quantcpp -- Compress AI's memory 3x. It gets faster.
-Quick start (3 lines):
+Quick start:
     from quantcpp import Model
-    m = Model.from_pretrained("SmolLM2-135M")
+    m = Model.from_pretrained("Llama-3.2-1B")
     print(m.ask("What is gravity?"))
-Full control:
-    m = Model("path/to/model.gguf", temperature=0.7, max_tokens=256)
-    for token in m.generate("Once upon a time"):
-        print(token, end="", flush=True)
-    m.close()
+Note: SmolLM2-135M downloads faster but produces low-quality output.
+Use Llama-3.2-1B (~750 MB, one-time download) for good results.
 """
 try:
     from importlib.metadata import version as _pkg_version
     __version__ = _pkg_version("quantcpp")
 except Exception:
-    __version__ = "0.10.1"  # fallback for editable / source-tree imports
+    __version__ = "0.11.0"  # fallback for editable / source-tree imports
 import os
 import sys
@@ -53,6 +49,11 @@ _MODEL_REGISTRY = {
         "smollm2-135m-instruct-q8_0.gguf",
         135,
     ),
+    "Qwen3.5-0.8B": (
+        "unsloth/Qwen3.5-0.8B-GGUF",
+        "Qwen3.5-0.8B-Q4_K_M.gguf",
+        508,
+    ),
     "Llama-3.2-1B": (
         "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
         "llama-3.2-1b-instruct-q4_k_m.gguf",
@@ -192,17 +193,21 @@ class Model:
         n_threads: int = 4,
         kv_compress: int = 1,
         context_length: int = 0,
-        progressive: bool = False,
+        progressive: bool = True,
+        aggressive: bool = False,
     ):
         """
         Parameters
         ----------
         progressive : bool
-            Enable progressive KV compression (default False). When True,
-            the last 128 tokens' keys are kept at FP32 for maximum quality,
-            while all older tokens are compressed. Reduces PPL degradation
-            from +3.8% to +0.6% at a cost of ~28 KB extra memory.
-            Like human memory: recent = vivid, older = faded but present.
+            Progressive KV compression (default True). Keeps last 128
+            tokens' keys at FP32 while compressing the rest. Verified
+            on 3 models: +0% to +3% PPL improvement at 1.75 MB cost.
+            No reason to disable — it's strictly better.
+        aggressive : bool
+            Maximum memory savings (default False). Uses 4-bit KV with
+            last 512 tokens at FP32. Ideal for very long context.
+            At 128K context: 4.6 GB instead of 9.2 GB KV cache.
         """
         if not os.path.isfile(path):
             raise FileNotFoundError(f"Model file not found: {path}")
@@ -212,11 +217,21 @@ class Model:
         self._top_p = top_p
         self._max_tokens = max_tokens
         self._n_threads = n_threads
-        self._kv_compress = kv_compress
         self._context_length = context_length
         self._progressive = progressive
+        self._aggressive = aggressive
+        if aggressive:
+            # 4-bit KV + 512-token FP32 window: best memory/quality ratio.
+            # Measured: same PPL as flat 4-bit, attention-aware precision.
+            # TODO: add uniform_2b (kv_compress=3) for 48% more savings.
+            k_win = 512
+        elif progressive:
+            k_win = 128
+        else:
+            k_win = 0
-        k_win = 128 if progressive else 0
+        self._kv_compress = kv_compress
         self._model = load_model(path)
         self._ctx = new_context(

{quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp/_quant.h RENAMED Viewed

@@ -8033,32 +8033,75 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
         }
     }
-    /* Load merges if available */
+    /* Build sorted indices BEFORE merge parsing so str_lookup() can use
+     * binary search instead of O(n) linear scan.  For 248K vocab with
+     * ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
+    tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
+    if (tok->sorted_indices) {
+        for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
+        g_vocab_for_sort = tok->vocab;
+        qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
+    }
+    /* Load and parse merges if available.
+     * GGUF stores merges as a string array of "tok_a tok_b" pairs.
+     * We need to look up token IDs and build (id_a, id_b, id_merged) triples
+     * so the BPE encoder can use them. */
     int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
     if (merges_idx >= 0) {
         const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
         if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
             mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
-            /* Parse merge rules: "token_a token_b" -> find IDs, store as merge pairs */
-            uint64_t n_merges = mkv->value.array.count;
-            tok->n_merges = (int)n_merges;
-            tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
+            uint64_t n_merges_total = mkv->value.array.count;
+            tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
+            tok->n_merges = 0;
             if (tok->merge_pairs) {
-                memset(tok->merge_pairs, 0, n_merges * 3 * sizeof(int));
+                tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
+                for (uint64_t mi = 0; mi < n_merges_total; mi++) {
+                    if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
+                    /* Copy merge string and split on space: "tok_a tok_b" */
+                    char buf[2048];
+                    int slen = (int)merge_strings[mi].len;
+                    if (slen >= (int)sizeof(buf)) continue;
+                    memcpy(buf, merge_strings[mi].str, (size_t)slen);
+                    buf[slen] = '\0';
+                    char* sep = strchr(buf, ' ');
+                    if (!sep) continue;
+                    *sep = '\0';
+                    const char* str_a = buf;
+                    const char* str_b = sep + 1;
+                    /* Build merged string: concatenation of tok_a + tok_b */
+                    char merged[2048];
+                    int la = (int)strlen(str_a);
+                    int lb = (int)strlen(str_b);
+                    if (la + lb >= (int)sizeof(merged)) continue;
+                    memcpy(merged, str_a, (size_t)la);
+                    memcpy(merged + la, str_b, (size_t)lb);
+                    merged[la + lb] = '\0';
+                    /* Look up token IDs via binary search (sorted_indices built above) */
+                    int id_a = str_lookup(tok, str_a);
+                    int id_b = str_lookup(tok, str_b);
+                    int id_merged = str_lookup(tok, merged);
+                    if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
+                        tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
+                        tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
+                        tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
+                        /* Priority: earlier merges in GGUF = higher priority */
+                        tok->scores[id_merged] = (float)(n_merges_total - mi);
+                        tok->n_merges++;
+                    }
+                }
+                fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
+                        tok->n_merges, (int)n_merges_total);
             }
         }
     }
-    /* Build sorted indices for encoding (binary search by string).
-     * Use qsort for O(n log n) instead of insertion sort O(n²) — critical
-     * for 248K vocab where insertion sort would take minutes. */
-    tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
-    if (tok->sorted_indices) {
-        for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
-        g_vocab_for_sort = tok->vocab;
-        qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
-    }
     fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
             tok->vocab_size, tok->max_token_len);
     return tok;
@@ -9939,18 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {
     free(tensors);
-    /* Qwen3.5 RMSNorm adjustment: Qwen3_5RMSNorm computes
-     * output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
-     * We bake the "+1" into the weight so tq_rmsnorm can stay as
-     * out = x * rsqrt * weight.
-     *
-     * This applies to: input_layernorm, post_attention_layernorm,
-     * model.norm, q_norm, k_norm.
-     * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
-     * uses plain weight without +1).
-     *
-     * We detect Qwen3.5 by the presence of DeltaNet layers. */
-    if (model->config.delta_n_heads > 0) {
+    /* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
+     * Only for non-GGUF models (raw checkpoints). GGUF files from
+     * llama.cpp already have +1 baked in by the converter.
+     * Qwen2/Qwen3 use standard RMSNorm and never need +1. */
+    if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
         int dim_h = model->config.hidden_dim;
         int head_dim_h = model->config.head_dim;
@@ -9979,7 +10015,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
             for (int i = 0; i < dim_h; i++)
                 model->output_norm[i] += 1.0f;
         }
-        fprintf(stderr, "tq_load_model: applied Qwen3.5 RMSNorm +1 weight adjustment\n");
+        fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
     }
     /* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */
@@ -15862,22 +15898,7 @@ void quant_free_string(char* str) {
     if (str) free(str);
 }
-/* ================================================================
- * Context persistence — save/load KV cache to disk
- *
- * File format (binary, little-endian):
- *   magic:     4 bytes "QKVC"
- *   version:   uint32 (1)
- *   n_layers:  uint32
- *   kv_dim:    uint32 (n_kv_heads * head_dim)
- *   max_seq:   uint32
- *   n_tokens:  uint32 (number of filled positions)
- *   kv_type:   uint32 (TQ_TYPE_* enum or TQ_TYPE_COUNT for fp32)
- *   has_fp16v: uint32 (1 if value_cache_fp16 is used)
- *   reserved:  32 bytes (future use)
- *   data:      raw KV cache bytes
- * ================================================================ */
+/* Context persistence: QKVC format (64-byte header + raw KV data) */
 int quant_save_context(quant_ctx* ctx, const char* path) {
     if (!ctx || !ctx->state || !path) return -1;
     FILE* fp = fopen(path, "wb");
@@ -15886,29 +15907,17 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
     tq_state_t* s = ctx->state;
     tq_model_config_t* c = &ctx->model->config;
     int kv_dim = c->n_kv_heads * c->head_dim;
-    /* Header */
     fwrite("QKVC", 1, 4, fp);
-    uint32_t version = 1;
-    uint32_t nl = (uint32_t)c->n_layers;
-    uint32_t kd = (uint32_t)kv_dim;
-    uint32_t ms = (uint32_t)c->max_seq_len;
-    uint32_t nt = (uint32_t)ctx->n_ctx_tokens;
-    uint32_t kt = (uint32_t)s->kv_quant_type;
-    uint32_t hfp16 = s->value_cache_fp16 ? 1 : 0;
-    fwrite(&version, 4, 1, fp);
-    fwrite(&nl, 4, 1, fp);
-    fwrite(&kd, 4, 1, fp);
-    fwrite(&ms, 4, 1, fp);
-    fwrite(&nt, 4, 1, fp);
-    fwrite(&kt, 4, 1, fp);
-    fwrite(&hfp16, 4, 1, fp);
-    char reserved[32] = {0};
-    fwrite(reserved, 1, 32, fp);
+    uint32_t hdr[7] = { 1, (uint32_t)c->n_layers, (uint32_t)kv_dim,
+        (uint32_t)c->max_seq_len, (uint32_t)ctx->n_ctx_tokens,
+        (uint32_t)s->kv_quant_type, s->value_cache_fp16 ? 1u : 0u };
+    fwrite(hdr, 4, 7, fp);
+    char reserved[32] = {0}; fwrite(reserved, 1, 32, fp);
+    uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
     /* KV data: write only the filled portion (nt tokens) */
     for (uint32_t l = 0; l < nl; l++) {
-        size_t layer_stride = (size_t)ms * kv_dim;
+        size_t layer_stride = (size_t)c->max_seq_len * kv_dim;
         /* Key cache: FP32 or quantized */
         if (s->key_cache) {
             fwrite(s->key_cache + l * layer_stride, sizeof(float),
@@ -15916,7 +15925,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
         }
         if (s->quant_key_cache && kt < TQ_TYPE_COUNT) {
             size_t blk_sz = tq_type_type_size(kt);
-            uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)ms * blk_sz;
+            uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)c->max_seq_len * blk_sz;
             fwrite(qbase, blk_sz, nt, fp);
         }
         /* Value cache: FP32 or FP16 */
@@ -15925,7 +15934,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
                    (size_t)nt * kv_dim, fp);
         }
         if (s->value_cache_fp16) {
-            size_t layer_stride16 = (size_t)ms * kv_dim;
+            size_t layer_stride16 = (size_t)c->max_seq_len * kv_dim;
             fwrite(s->value_cache_fp16 + l * layer_stride16, sizeof(uint16_t),
                    (size_t)nt * kv_dim, fp);
         }
@@ -15942,37 +15951,16 @@ int quant_load_context(quant_ctx* ctx, const char* path) {
     FILE* fp = fopen(path, "rb");
     if (!fp) return -1;
-    /* Read and validate header */
     char magic[4];
-    if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) {
-        fclose(fp); return -1;
-    }
-    uint32_t version, nl, kd, ms, nt, kt, hfp16;
-    fread(&version, 4, 1, fp);
-    fread(&nl, 4, 1, fp);
-    fread(&kd, 4, 1, fp);
-    fread(&ms, 4, 1, fp);
-    fread(&nt, 4, 1, fp);
-    fread(&kt, 4, 1, fp);
-    fread(&hfp16, 4, 1, fp);
-    char reserved[32];
-    fread(reserved, 1, 32, fp);
+    if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) { fclose(fp); return -1; }
+    uint32_t hdr[7]; fread(hdr, 4, 7, fp);
+    char reserved[32]; fread(reserved, 1, 32, fp);
+    uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
     tq_state_t* s = ctx->state;
     tq_model_config_t* c = &ctx->model->config;
     int kv_dim = c->n_kv_heads * c->head_dim;
-    /* Validate compatibility */
-    if (nl != (uint32_t)c->n_layers || kd != (uint32_t)kv_dim) {
-        fprintf(stderr, "quant_load_context: model mismatch (layers %u vs %d, kv_dim %u vs %d)\n",
-                nl, c->n_layers, kd, kv_dim);
-        fclose(fp); return -1;
-    }
-    if (nt > (uint32_t)c->max_seq_len) {
-        fprintf(stderr, "quant_load_context: saved %u tokens > max_seq_len %d\n",
-                nt, c->max_seq_len);
-        fclose(fp); return -1;
-    }
+    if (nl != (uint32_t)c->n_layers || hdr[2] != (uint32_t)kv_dim) { fclose(fp); return -1; }
+    if (nt > (uint32_t)c->max_seq_len) { fclose(fp); return -1; }
     /* Read KV data */
     for (uint32_t l = 0; l < nl; l++) {

quantcpp-0.11.0/quantcpp/cli.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""
+quantcpp CLI — chat with a local LLM in your terminal.
+Usage:
+    quantcpp                          # auto-downloads Llama-3.2-1B, starts chat
+    quantcpp "What is gravity?"       # one-shot question
+    quantcpp --model SmolLM2-135M     # use a smaller model (faster download)
+    quantcpp --model path/to/file.gguf  # use your own GGUF file
+"""
+import sys
+import os
+def main():
+    import argparse
+    parser = argparse.ArgumentParser(
+        prog="quantcpp",
+        description="Chat with a local LLM. No API key, no GPU, no server.",
+    )
+    parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
+    parser.add_argument("--model", "-m", default="Llama-3.2-1B",
+                        help="Model name or path to .gguf file (default: Llama-3.2-1B)")
+    parser.add_argument("--max-tokens", "-n", type=int, default=256)
+    parser.add_argument("--temperature", "-t", type=float, default=0.7)
+    args = parser.parse_args()
+    from quantcpp import Model
+    # Load model
+    model_path = args.model
+    if os.path.isfile(model_path):
+        print(f"Loading {model_path}...", file=sys.stderr)
+        m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
+    else:
+        print(f"Downloading {model_path}...", file=sys.stderr)
+        m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
+                                   temperature=args.temperature)
+    # One-shot or interactive
+    if args.prompt:
+        question = " ".join(args.prompt)
+        for tok in m.generate(question):
+            print(tok, end="", flush=True)
+        print()
+    else:
+        print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
+        try:
+            while True:
+                question = input("\nYou: ")
+                if not question.strip():
+                    continue
+                print("AI: ", end="", flush=True)
+                for tok in m.generate(question):
+                    print(tok, end="", flush=True)
+                print()
+        except (KeyboardInterrupt, EOFError):
+            print("\nBye!", file=sys.stderr)
+    m.close()
+if __name__ == "__main__":
+    main()

{quantcpp-0.10.1 → quantcpp-0.11.0/quantcpp.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: quantcpp
-Version: 0.10.1
+Version: 0.11.0
 Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
 Author-email: quantumaikr <noreply@quantumaikr.com>
 License: Apache-2.0

{quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp.egg-info/SOURCES.txt RENAMED Viewed

@@ -6,9 +6,11 @@ setup.py
 quantcpp/__init__.py
 quantcpp/_binding.py
 quantcpp/_quant.h
+quantcpp/cli.py
 quantcpp.egg-info/PKG-INFO
 quantcpp.egg-info/SOURCES.txt
 quantcpp.egg-info/dependency_links.txt
+quantcpp.egg-info/entry_points.txt
 quantcpp.egg-info/requires.txt
 quantcpp.egg-info/top_level.txt
 tests/test_basic.py