quantcpp 0.10.1__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: quantcpp
3
- Version: 0.10.1
3
+ Version: 0.11.0
4
4
  Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
5
5
  Author-email: quantumaikr <noreply@quantumaikr.com>
6
6
  License: Apache-2.0
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "quantcpp"
10
- version = "0.10.1"
10
+ version = "0.11.0"
11
11
  description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
12
12
  readme = "README.md"
13
13
  license = { text = "Apache-2.0" }
@@ -43,6 +43,9 @@ Source = "https://github.com/quantumaikr/quant.cpp"
43
43
  Issues = "https://github.com/quantumaikr/quant.cpp/issues"
44
44
  Changelog = "https://github.com/quantumaikr/quant.cpp/blob/main/CHANGELOG.md"
45
45
 
46
+ [project.scripts]
47
+ quantcpp = "quantcpp.cli:main"
48
+
46
49
  [project.optional-dependencies]
47
50
  dev = ["pytest>=7.0", "build", "twine"]
48
51
 
@@ -8033,32 +8033,75 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
8033
8033
  }
8034
8034
  }
8035
8035
 
8036
- /* Load merges if available */
8036
+ /* Build sorted indices BEFORE merge parsing so str_lookup() can use
8037
+ * binary search instead of O(n) linear scan. For 248K vocab with
8038
+ * ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
8039
+ tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
8040
+ if (tok->sorted_indices) {
8041
+ for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
8042
+ g_vocab_for_sort = tok->vocab;
8043
+ qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
8044
+ }
8045
+
8046
+ /* Load and parse merges if available.
8047
+ * GGUF stores merges as a string array of "tok_a tok_b" pairs.
8048
+ * We need to look up token IDs and build (id_a, id_b, id_merged) triples
8049
+ * so the BPE encoder can use them. */
8037
8050
  int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
8038
8051
  if (merges_idx >= 0) {
8039
8052
  const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
8040
8053
  if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
8041
8054
  mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
8042
- /* Parse merge rules: "token_a token_b" -> find IDs, store as merge pairs */
8043
- uint64_t n_merges = mkv->value.array.count;
8044
- tok->n_merges = (int)n_merges;
8045
- tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
8055
+ uint64_t n_merges_total = mkv->value.array.count;
8056
+ tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
8057
+ tok->n_merges = 0;
8046
8058
  if (tok->merge_pairs) {
8047
- memset(tok->merge_pairs, 0, n_merges * 3 * sizeof(int));
8059
+ tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
8060
+ for (uint64_t mi = 0; mi < n_merges_total; mi++) {
8061
+ if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
8062
+
8063
+ /* Copy merge string and split on space: "tok_a tok_b" */
8064
+ char buf[2048];
8065
+ int slen = (int)merge_strings[mi].len;
8066
+ if (slen >= (int)sizeof(buf)) continue;
8067
+ memcpy(buf, merge_strings[mi].str, (size_t)slen);
8068
+ buf[slen] = '\0';
8069
+
8070
+ char* sep = strchr(buf, ' ');
8071
+ if (!sep) continue;
8072
+ *sep = '\0';
8073
+ const char* str_a = buf;
8074
+ const char* str_b = sep + 1;
8075
+
8076
+ /* Build merged string: concatenation of tok_a + tok_b */
8077
+ char merged[2048];
8078
+ int la = (int)strlen(str_a);
8079
+ int lb = (int)strlen(str_b);
8080
+ if (la + lb >= (int)sizeof(merged)) continue;
8081
+ memcpy(merged, str_a, (size_t)la);
8082
+ memcpy(merged + la, str_b, (size_t)lb);
8083
+ merged[la + lb] = '\0';
8084
+
8085
+ /* Look up token IDs via binary search (sorted_indices built above) */
8086
+ int id_a = str_lookup(tok, str_a);
8087
+ int id_b = str_lookup(tok, str_b);
8088
+ int id_merged = str_lookup(tok, merged);
8089
+
8090
+ if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
8091
+ tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
8092
+ tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
8093
+ tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
8094
+ /* Priority: earlier merges in GGUF = higher priority */
8095
+ tok->scores[id_merged] = (float)(n_merges_total - mi);
8096
+ tok->n_merges++;
8097
+ }
8098
+ }
8099
+ fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
8100
+ tok->n_merges, (int)n_merges_total);
8048
8101
  }
8049
8102
  }
8050
8103
  }
8051
8104
 
8052
- /* Build sorted indices for encoding (binary search by string).
8053
- * Use qsort for O(n log n) instead of insertion sort O(n²) — critical
8054
- * for 248K vocab where insertion sort would take minutes. */
8055
- tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
8056
- if (tok->sorted_indices) {
8057
- for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
8058
- g_vocab_for_sort = tok->vocab;
8059
- qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
8060
- }
8061
-
8062
8105
  fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
8063
8106
  tok->vocab_size, tok->max_token_len);
8064
8107
  return tok;
@@ -9939,18 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {
9939
9982
 
9940
9983
  free(tensors);
9941
9984
 
9942
- /* Qwen3.5 RMSNorm adjustment: Qwen3_5RMSNorm computes
9943
- * output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
9944
- * We bake the "+1" into the weight so tq_rmsnorm can stay as
9945
- * out = x * rsqrt * weight.
9946
- *
9947
- * This applies to: input_layernorm, post_attention_layernorm,
9948
- * model.norm, q_norm, k_norm.
9949
- * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
9950
- * uses plain weight without +1).
9951
- *
9952
- * We detect Qwen3.5 by the presence of DeltaNet layers. */
9953
- if (model->config.delta_n_heads > 0) {
9985
+ /* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
9986
+ * Only for non-GGUF models (raw checkpoints). GGUF files from
9987
+ * llama.cpp already have +1 baked in by the converter.
9988
+ * Qwen2/Qwen3 use standard RMSNorm and never need +1. */
9989
+ if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
9954
9990
  int dim_h = model->config.hidden_dim;
9955
9991
  int head_dim_h = model->config.head_dim;
9956
9992
 
@@ -9979,7 +10015,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
9979
10015
  for (int i = 0; i < dim_h; i++)
9980
10016
  model->output_norm[i] += 1.0f;
9981
10017
  }
9982
- fprintf(stderr, "tq_load_model: applied Qwen3.5 RMSNorm +1 weight adjustment\n");
10018
+ fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
9983
10019
  }
9984
10020
 
9985
10021
  /* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */
@@ -15862,22 +15898,7 @@ void quant_free_string(char* str) {
15862
15898
  if (str) free(str);
15863
15899
  }
15864
15900
 
15865
- /* ================================================================
15866
- * Context persistence — save/load KV cache to disk
15867
- *
15868
- * File format (binary, little-endian):
15869
- * magic: 4 bytes "QKVC"
15870
- * version: uint32 (1)
15871
- * n_layers: uint32
15872
- * kv_dim: uint32 (n_kv_heads * head_dim)
15873
- * max_seq: uint32
15874
- * n_tokens: uint32 (number of filled positions)
15875
- * kv_type: uint32 (TQ_TYPE_* enum or TQ_TYPE_COUNT for fp32)
15876
- * has_fp16v: uint32 (1 if value_cache_fp16 is used)
15877
- * reserved: 32 bytes (future use)
15878
- * data: raw KV cache bytes
15879
- * ================================================================ */
15880
-
15901
+ /* Context persistence: QKVC format (64-byte header + raw KV data) */
15881
15902
  int quant_save_context(quant_ctx* ctx, const char* path) {
15882
15903
  if (!ctx || !ctx->state || !path) return -1;
15883
15904
  FILE* fp = fopen(path, "wb");
@@ -15886,29 +15907,17 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
15886
15907
  tq_state_t* s = ctx->state;
15887
15908
  tq_model_config_t* c = &ctx->model->config;
15888
15909
  int kv_dim = c->n_kv_heads * c->head_dim;
15889
-
15890
- /* Header */
15891
15910
  fwrite("QKVC", 1, 4, fp);
15892
- uint32_t version = 1;
15893
- uint32_t nl = (uint32_t)c->n_layers;
15894
- uint32_t kd = (uint32_t)kv_dim;
15895
- uint32_t ms = (uint32_t)c->max_seq_len;
15896
- uint32_t nt = (uint32_t)ctx->n_ctx_tokens;
15897
- uint32_t kt = (uint32_t)s->kv_quant_type;
15898
- uint32_t hfp16 = s->value_cache_fp16 ? 1 : 0;
15899
- fwrite(&version, 4, 1, fp);
15900
- fwrite(&nl, 4, 1, fp);
15901
- fwrite(&kd, 4, 1, fp);
15902
- fwrite(&ms, 4, 1, fp);
15903
- fwrite(&nt, 4, 1, fp);
15904
- fwrite(&kt, 4, 1, fp);
15905
- fwrite(&hfp16, 4, 1, fp);
15906
- char reserved[32] = {0};
15907
- fwrite(reserved, 1, 32, fp);
15911
+ uint32_t hdr[7] = { 1, (uint32_t)c->n_layers, (uint32_t)kv_dim,
15912
+ (uint32_t)c->max_seq_len, (uint32_t)ctx->n_ctx_tokens,
15913
+ (uint32_t)s->kv_quant_type, s->value_cache_fp16 ? 1u : 0u };
15914
+ fwrite(hdr, 4, 7, fp);
15915
+ char reserved[32] = {0}; fwrite(reserved, 1, 32, fp);
15916
+ uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
15908
15917
 
15909
15918
  /* KV data: write only the filled portion (nt tokens) */
15910
15919
  for (uint32_t l = 0; l < nl; l++) {
15911
- size_t layer_stride = (size_t)ms * kv_dim;
15920
+ size_t layer_stride = (size_t)c->max_seq_len * kv_dim;
15912
15921
  /* Key cache: FP32 or quantized */
15913
15922
  if (s->key_cache) {
15914
15923
  fwrite(s->key_cache + l * layer_stride, sizeof(float),
@@ -15916,7 +15925,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
15916
15925
  }
15917
15926
  if (s->quant_key_cache && kt < TQ_TYPE_COUNT) {
15918
15927
  size_t blk_sz = tq_type_type_size(kt);
15919
- uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)ms * blk_sz;
15928
+ uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)c->max_seq_len * blk_sz;
15920
15929
  fwrite(qbase, blk_sz, nt, fp);
15921
15930
  }
15922
15931
  /* Value cache: FP32 or FP16 */
@@ -15925,7 +15934,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
15925
15934
  (size_t)nt * kv_dim, fp);
15926
15935
  }
15927
15936
  if (s->value_cache_fp16) {
15928
- size_t layer_stride16 = (size_t)ms * kv_dim;
15937
+ size_t layer_stride16 = (size_t)c->max_seq_len * kv_dim;
15929
15938
  fwrite(s->value_cache_fp16 + l * layer_stride16, sizeof(uint16_t),
15930
15939
  (size_t)nt * kv_dim, fp);
15931
15940
  }
@@ -15942,37 +15951,16 @@ int quant_load_context(quant_ctx* ctx, const char* path) {
15942
15951
  FILE* fp = fopen(path, "rb");
15943
15952
  if (!fp) return -1;
15944
15953
 
15945
- /* Read and validate header */
15946
15954
  char magic[4];
15947
- if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) {
15948
- fclose(fp); return -1;
15949
- }
15950
- uint32_t version, nl, kd, ms, nt, kt, hfp16;
15951
- fread(&version, 4, 1, fp);
15952
- fread(&nl, 4, 1, fp);
15953
- fread(&kd, 4, 1, fp);
15954
- fread(&ms, 4, 1, fp);
15955
- fread(&nt, 4, 1, fp);
15956
- fread(&kt, 4, 1, fp);
15957
- fread(&hfp16, 4, 1, fp);
15958
- char reserved[32];
15959
- fread(reserved, 1, 32, fp);
15960
-
15955
+ if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) { fclose(fp); return -1; }
15956
+ uint32_t hdr[7]; fread(hdr, 4, 7, fp);
15957
+ char reserved[32]; fread(reserved, 1, 32, fp);
15958
+ uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
15961
15959
  tq_state_t* s = ctx->state;
15962
15960
  tq_model_config_t* c = &ctx->model->config;
15963
15961
  int kv_dim = c->n_kv_heads * c->head_dim;
15964
-
15965
- /* Validate compatibility */
15966
- if (nl != (uint32_t)c->n_layers || kd != (uint32_t)kv_dim) {
15967
- fprintf(stderr, "quant_load_context: model mismatch (layers %u vs %d, kv_dim %u vs %d)\n",
15968
- nl, c->n_layers, kd, kv_dim);
15969
- fclose(fp); return -1;
15970
- }
15971
- if (nt > (uint32_t)c->max_seq_len) {
15972
- fprintf(stderr, "quant_load_context: saved %u tokens > max_seq_len %d\n",
15973
- nt, c->max_seq_len);
15974
- fclose(fp); return -1;
15975
- }
15962
+ if (nl != (uint32_t)c->n_layers || hdr[2] != (uint32_t)kv_dim) { fclose(fp); return -1; }
15963
+ if (nt > (uint32_t)c->max_seq_len) { fclose(fp); return -1; }
15976
15964
 
15977
15965
  /* Read KV data */
15978
15966
  for (uint32_t l = 0; l < nl; l++) {
@@ -1,25 +1,21 @@
1
1
  """
2
- quantcpp -- The SQLite of LLMs. Single-header C inference in Python.
2
+ quantcpp -- Compress AI's memory 3x. It gets faster.
3
3
 
4
- Quick start (3 lines):
4
+ Quick start:
5
5
 
6
6
  from quantcpp import Model
7
- m = Model.from_pretrained("SmolLM2-135M")
7
+ m = Model.from_pretrained("Llama-3.2-1B")
8
8
  print(m.ask("What is gravity?"))
9
9
 
10
- Full control:
11
-
12
- m = Model("path/to/model.gguf", temperature=0.7, max_tokens=256)
13
- for token in m.generate("Once upon a time"):
14
- print(token, end="", flush=True)
15
- m.close()
10
+ Note: SmolLM2-135M downloads faster but produces low-quality output.
11
+ Use Llama-3.2-1B (~750 MB, one-time download) for good results.
16
12
  """
17
13
 
18
14
  try:
19
15
  from importlib.metadata import version as _pkg_version
20
16
  __version__ = _pkg_version("quantcpp")
21
17
  except Exception:
22
- __version__ = "0.10.1" # fallback for editable / source-tree imports
18
+ __version__ = "0.11.0" # fallback for editable / source-tree imports
23
19
 
24
20
  import os
25
21
  import sys
@@ -53,6 +49,11 @@ _MODEL_REGISTRY = {
53
49
  "smollm2-135m-instruct-q8_0.gguf",
54
50
  135,
55
51
  ),
52
+ "Qwen3.5-0.8B": (
53
+ "unsloth/Qwen3.5-0.8B-GGUF",
54
+ "Qwen3.5-0.8B-Q4_K_M.gguf",
55
+ 508,
56
+ ),
56
57
  "Llama-3.2-1B": (
57
58
  "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
58
59
  "llama-3.2-1b-instruct-q4_k_m.gguf",
@@ -192,17 +193,21 @@ class Model:
192
193
  n_threads: int = 4,
193
194
  kv_compress: int = 1,
194
195
  context_length: int = 0,
195
- progressive: bool = False,
196
+ progressive: bool = True,
197
+ aggressive: bool = False,
196
198
  ):
197
199
  """
198
200
  Parameters
199
201
  ----------
200
202
  progressive : bool
201
- Enable progressive KV compression (default False). When True,
202
- the last 128 tokens' keys are kept at FP32 for maximum quality,
203
- while all older tokens are compressed. Reduces PPL degradation
204
- from +3.8% to +0.6% at a cost of ~28 KB extra memory.
205
- Like human memory: recent = vivid, older = faded but present.
203
+ Progressive KV compression (default True). Keeps last 128
204
+ tokens' keys at FP32 while compressing the rest. Verified
205
+ on 3 models: +0% to +3% PPL improvement at 1.75 MB cost.
206
+ No reason to disable it: it's strictly better.
207
+ aggressive : bool
208
+ Maximum memory savings (default False). Uses 4-bit KV with
209
+ last 512 tokens at FP32. Ideal for very long context.
210
+ At 128K context: 4.6 GB instead of 9.2 GB KV cache.
206
211
  """
207
212
  if not os.path.isfile(path):
208
213
  raise FileNotFoundError(f"Model file not found: {path}")
@@ -212,11 +217,21 @@ class Model:
212
217
  self._top_p = top_p
213
218
  self._max_tokens = max_tokens
214
219
  self._n_threads = n_threads
215
- self._kv_compress = kv_compress
216
220
  self._context_length = context_length
217
221
  self._progressive = progressive
222
+ self._aggressive = aggressive
223
+
224
+ if aggressive:
225
+ # 4-bit KV + 512-token FP32 window: best memory/quality ratio.
226
+ # Measured: same PPL as flat 4-bit, attention-aware precision.
227
+ # TODO: add uniform_2b (kv_compress=3) for 48% more savings.
228
+ k_win = 512
229
+ elif progressive:
230
+ k_win = 128
231
+ else:
232
+ k_win = 0
218
233
 
219
- k_win = 128 if progressive else 0
234
+ self._kv_compress = kv_compress
220
235
 
221
236
  self._model = load_model(path)
222
237
  self._ctx = new_context(
@@ -8033,32 +8033,75 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
8033
8033
  }
8034
8034
  }
8035
8035
 
8036
- /* Load merges if available */
8036
+ /* Build sorted indices BEFORE merge parsing so str_lookup() can use
8037
+ * binary search instead of O(n) linear scan. For 248K vocab with
8038
+ * ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
8039
+ tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
8040
+ if (tok->sorted_indices) {
8041
+ for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
8042
+ g_vocab_for_sort = tok->vocab;
8043
+ qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
8044
+ }
8045
+
8046
+ /* Load and parse merges if available.
8047
+ * GGUF stores merges as a string array of "tok_a tok_b" pairs.
8048
+ * We need to look up token IDs and build (id_a, id_b, id_merged) triples
8049
+ * so the BPE encoder can use them. */
8037
8050
  int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
8038
8051
  if (merges_idx >= 0) {
8039
8052
  const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
8040
8053
  if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
8041
8054
  mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
8042
- /* Parse merge rules: "token_a token_b" -> find IDs, store as merge pairs */
8043
- uint64_t n_merges = mkv->value.array.count;
8044
- tok->n_merges = (int)n_merges;
8045
- tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
8055
+ uint64_t n_merges_total = mkv->value.array.count;
8056
+ tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
8057
+ tok->n_merges = 0;
8046
8058
  if (tok->merge_pairs) {
8047
- memset(tok->merge_pairs, 0, n_merges * 3 * sizeof(int));
8059
+ tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
8060
+ for (uint64_t mi = 0; mi < n_merges_total; mi++) {
8061
+ if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
8062
+
8063
+ /* Copy merge string and split on space: "tok_a tok_b" */
8064
+ char buf[2048];
8065
+ int slen = (int)merge_strings[mi].len;
8066
+ if (slen >= (int)sizeof(buf)) continue;
8067
+ memcpy(buf, merge_strings[mi].str, (size_t)slen);
8068
+ buf[slen] = '\0';
8069
+
8070
+ char* sep = strchr(buf, ' ');
8071
+ if (!sep) continue;
8072
+ *sep = '\0';
8073
+ const char* str_a = buf;
8074
+ const char* str_b = sep + 1;
8075
+
8076
+ /* Build merged string: concatenation of tok_a + tok_b */
8077
+ char merged[2048];
8078
+ int la = (int)strlen(str_a);
8079
+ int lb = (int)strlen(str_b);
8080
+ if (la + lb >= (int)sizeof(merged)) continue;
8081
+ memcpy(merged, str_a, (size_t)la);
8082
+ memcpy(merged + la, str_b, (size_t)lb);
8083
+ merged[la + lb] = '\0';
8084
+
8085
+ /* Look up token IDs via binary search (sorted_indices built above) */
8086
+ int id_a = str_lookup(tok, str_a);
8087
+ int id_b = str_lookup(tok, str_b);
8088
+ int id_merged = str_lookup(tok, merged);
8089
+
8090
+ if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
8091
+ tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
8092
+ tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
8093
+ tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
8094
+ /* Priority: earlier merges in GGUF = higher priority */
8095
+ tok->scores[id_merged] = (float)(n_merges_total - mi);
8096
+ tok->n_merges++;
8097
+ }
8098
+ }
8099
+ fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
8100
+ tok->n_merges, (int)n_merges_total);
8048
8101
  }
8049
8102
  }
8050
8103
  }
8051
8104
 
8052
- /* Build sorted indices for encoding (binary search by string).
8053
- * Use qsort for O(n log n) instead of insertion sort O(n²) — critical
8054
- * for 248K vocab where insertion sort would take minutes. */
8055
- tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
8056
- if (tok->sorted_indices) {
8057
- for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
8058
- g_vocab_for_sort = tok->vocab;
8059
- qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
8060
- }
8061
-
8062
8105
  fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
8063
8106
  tok->vocab_size, tok->max_token_len);
8064
8107
  return tok;
@@ -9939,18 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {
9939
9982
 
9940
9983
  free(tensors);
9941
9984
 
9942
- /* Qwen3.5 RMSNorm adjustment: Qwen3_5RMSNorm computes
9943
- * output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
9944
- * We bake the "+1" into the weight so tq_rmsnorm can stay as
9945
- * out = x * rsqrt * weight.
9946
- *
9947
- * This applies to: input_layernorm, post_attention_layernorm,
9948
- * model.norm, q_norm, k_norm.
9949
- * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
9950
- * uses plain weight without +1).
9951
- *
9952
- * We detect Qwen3.5 by the presence of DeltaNet layers. */
9953
- if (model->config.delta_n_heads > 0) {
9985
+ /* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
9986
+ * Only for non-GGUF models (raw checkpoints). GGUF files from
9987
+ * llama.cpp already have +1 baked in by the converter.
9988
+ * Qwen2/Qwen3 use standard RMSNorm and never need +1. */
9989
+ if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
9954
9990
  int dim_h = model->config.hidden_dim;
9955
9991
  int head_dim_h = model->config.head_dim;
9956
9992
 
@@ -9979,7 +10015,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
9979
10015
  for (int i = 0; i < dim_h; i++)
9980
10016
  model->output_norm[i] += 1.0f;
9981
10017
  }
9982
- fprintf(stderr, "tq_load_model: applied Qwen3.5 RMSNorm +1 weight adjustment\n");
10018
+ fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
9983
10019
  }
9984
10020
 
9985
10021
  /* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */
@@ -15862,22 +15898,7 @@ void quant_free_string(char* str) {
15862
15898
  if (str) free(str);
15863
15899
  }
15864
15900
 
15865
- /* ================================================================
15866
- * Context persistence — save/load KV cache to disk
15867
- *
15868
- * File format (binary, little-endian):
15869
- * magic: 4 bytes "QKVC"
15870
- * version: uint32 (1)
15871
- * n_layers: uint32
15872
- * kv_dim: uint32 (n_kv_heads * head_dim)
15873
- * max_seq: uint32
15874
- * n_tokens: uint32 (number of filled positions)
15875
- * kv_type: uint32 (TQ_TYPE_* enum or TQ_TYPE_COUNT for fp32)
15876
- * has_fp16v: uint32 (1 if value_cache_fp16 is used)
15877
- * reserved: 32 bytes (future use)
15878
- * data: raw KV cache bytes
15879
- * ================================================================ */
15880
-
15901
+ /* Context persistence: QKVC format (64-byte header + raw KV data) */
15881
15902
  int quant_save_context(quant_ctx* ctx, const char* path) {
15882
15903
  if (!ctx || !ctx->state || !path) return -1;
15883
15904
  FILE* fp = fopen(path, "wb");
@@ -15886,29 +15907,17 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
15886
15907
  tq_state_t* s = ctx->state;
15887
15908
  tq_model_config_t* c = &ctx->model->config;
15888
15909
  int kv_dim = c->n_kv_heads * c->head_dim;
15889
-
15890
- /* Header */
15891
15910
  fwrite("QKVC", 1, 4, fp);
15892
- uint32_t version = 1;
15893
- uint32_t nl = (uint32_t)c->n_layers;
15894
- uint32_t kd = (uint32_t)kv_dim;
15895
- uint32_t ms = (uint32_t)c->max_seq_len;
15896
- uint32_t nt = (uint32_t)ctx->n_ctx_tokens;
15897
- uint32_t kt = (uint32_t)s->kv_quant_type;
15898
- uint32_t hfp16 = s->value_cache_fp16 ? 1 : 0;
15899
- fwrite(&version, 4, 1, fp);
15900
- fwrite(&nl, 4, 1, fp);
15901
- fwrite(&kd, 4, 1, fp);
15902
- fwrite(&ms, 4, 1, fp);
15903
- fwrite(&nt, 4, 1, fp);
15904
- fwrite(&kt, 4, 1, fp);
15905
- fwrite(&hfp16, 4, 1, fp);
15906
- char reserved[32] = {0};
15907
- fwrite(reserved, 1, 32, fp);
15911
+ uint32_t hdr[7] = { 1, (uint32_t)c->n_layers, (uint32_t)kv_dim,
15912
+ (uint32_t)c->max_seq_len, (uint32_t)ctx->n_ctx_tokens,
15913
+ (uint32_t)s->kv_quant_type, s->value_cache_fp16 ? 1u : 0u };
15914
+ fwrite(hdr, 4, 7, fp);
15915
+ char reserved[32] = {0}; fwrite(reserved, 1, 32, fp);
15916
+ uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
15908
15917
 
15909
15918
  /* KV data: write only the filled portion (nt tokens) */
15910
15919
  for (uint32_t l = 0; l < nl; l++) {
15911
- size_t layer_stride = (size_t)ms * kv_dim;
15920
+ size_t layer_stride = (size_t)c->max_seq_len * kv_dim;
15912
15921
  /* Key cache: FP32 or quantized */
15913
15922
  if (s->key_cache) {
15914
15923
  fwrite(s->key_cache + l * layer_stride, sizeof(float),
@@ -15916,7 +15925,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
15916
15925
  }
15917
15926
  if (s->quant_key_cache && kt < TQ_TYPE_COUNT) {
15918
15927
  size_t blk_sz = tq_type_type_size(kt);
15919
- uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)ms * blk_sz;
15928
+ uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)c->max_seq_len * blk_sz;
15920
15929
  fwrite(qbase, blk_sz, nt, fp);
15921
15930
  }
15922
15931
  /* Value cache: FP32 or FP16 */
@@ -15925,7 +15934,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
15925
15934
  (size_t)nt * kv_dim, fp);
15926
15935
  }
15927
15936
  if (s->value_cache_fp16) {
15928
- size_t layer_stride16 = (size_t)ms * kv_dim;
15937
+ size_t layer_stride16 = (size_t)c->max_seq_len * kv_dim;
15929
15938
  fwrite(s->value_cache_fp16 + l * layer_stride16, sizeof(uint16_t),
15930
15939
  (size_t)nt * kv_dim, fp);
15931
15940
  }
@@ -15942,37 +15951,16 @@ int quant_load_context(quant_ctx* ctx, const char* path) {
15942
15951
  FILE* fp = fopen(path, "rb");
15943
15952
  if (!fp) return -1;
15944
15953
 
15945
- /* Read and validate header */
15946
15954
  char magic[4];
15947
- if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) {
15948
- fclose(fp); return -1;
15949
- }
15950
- uint32_t version, nl, kd, ms, nt, kt, hfp16;
15951
- fread(&version, 4, 1, fp);
15952
- fread(&nl, 4, 1, fp);
15953
- fread(&kd, 4, 1, fp);
15954
- fread(&ms, 4, 1, fp);
15955
- fread(&nt, 4, 1, fp);
15956
- fread(&kt, 4, 1, fp);
15957
- fread(&hfp16, 4, 1, fp);
15958
- char reserved[32];
15959
- fread(reserved, 1, 32, fp);
15960
-
15955
+ if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) { fclose(fp); return -1; }
15956
+ uint32_t hdr[7]; fread(hdr, 4, 7, fp);
15957
+ char reserved[32]; fread(reserved, 1, 32, fp);
15958
+ uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
15961
15959
  tq_state_t* s = ctx->state;
15962
15960
  tq_model_config_t* c = &ctx->model->config;
15963
15961
  int kv_dim = c->n_kv_heads * c->head_dim;
15964
-
15965
- /* Validate compatibility */
15966
- if (nl != (uint32_t)c->n_layers || kd != (uint32_t)kv_dim) {
15967
- fprintf(stderr, "quant_load_context: model mismatch (layers %u vs %d, kv_dim %u vs %d)\n",
15968
- nl, c->n_layers, kd, kv_dim);
15969
- fclose(fp); return -1;
15970
- }
15971
- if (nt > (uint32_t)c->max_seq_len) {
15972
- fprintf(stderr, "quant_load_context: saved %u tokens > max_seq_len %d\n",
15973
- nt, c->max_seq_len);
15974
- fclose(fp); return -1;
15975
- }
15962
+ if (nl != (uint32_t)c->n_layers || hdr[2] != (uint32_t)kv_dim) { fclose(fp); return -1; }
15963
+ if (nt > (uint32_t)c->max_seq_len) { fclose(fp); return -1; }
15976
15964
 
15977
15965
  /* Read KV data */
15978
15966
  for (uint32_t l = 0; l < nl; l++) {
@@ -0,0 +1,64 @@
1
+ """
2
+ quantcpp CLI — chat with a local LLM in your terminal.
3
+
4
+ Usage:
5
+ quantcpp # auto-downloads Llama-3.2-1B, starts chat
6
+ quantcpp "What is gravity?" # one-shot question
7
+ quantcpp --model SmolLM2-135M # use a smaller model (faster download)
8
+ quantcpp --model path/to/file.gguf # use your own GGUF file
9
+ """
10
+
11
+ import sys
12
+ import os
13
+
14
+
15
+ def main():
16
+ import argparse
17
+ parser = argparse.ArgumentParser(
18
+ prog="quantcpp",
19
+ description="Chat with a local LLM. No API key, no GPU, no server.",
20
+ )
21
+ parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
22
+ parser.add_argument("--model", "-m", default="Llama-3.2-1B",
23
+ help="Model name or path to .gguf file (default: Llama-3.2-1B)")
24
+ parser.add_argument("--max-tokens", "-n", type=int, default=256)
25
+ parser.add_argument("--temperature", "-t", type=float, default=0.7)
26
+ args = parser.parse_args()
27
+
28
+ from quantcpp import Model
29
+
30
+ # Load model
31
+ model_path = args.model
32
+ if os.path.isfile(model_path):
33
+ print(f"Loading {model_path}...", file=sys.stderr)
34
+ m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
35
+ else:
36
+ print(f"Downloading {model_path}...", file=sys.stderr)
37
+ m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
38
+ temperature=args.temperature)
39
+
40
+ # One-shot or interactive
41
+ if args.prompt:
42
+ question = " ".join(args.prompt)
43
+ for tok in m.generate(question):
44
+ print(tok, end="", flush=True)
45
+ print()
46
+ else:
47
+ print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
48
+ try:
49
+ while True:
50
+ question = input("\nYou: ")
51
+ if not question.strip():
52
+ continue
53
+ print("AI: ", end="", flush=True)
54
+ for tok in m.generate(question):
55
+ print(tok, end="", flush=True)
56
+ print()
57
+ except (KeyboardInterrupt, EOFError):
58
+ print("\nBye!", file=sys.stderr)
59
+
60
+ m.close()
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: quantcpp
3
- Version: 0.10.1
3
+ Version: 0.11.0
4
4
  Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
5
5
  Author-email: quantumaikr <noreply@quantumaikr.com>
6
6
  License: Apache-2.0
@@ -6,9 +6,11 @@ setup.py
6
6
  quantcpp/__init__.py
7
7
  quantcpp/_binding.py
8
8
  quantcpp/_quant.h
9
+ quantcpp/cli.py
9
10
  quantcpp.egg-info/PKG-INFO
10
11
  quantcpp.egg-info/SOURCES.txt
11
12
  quantcpp.egg-info/dependency_links.txt
13
+ quantcpp.egg-info/entry_points.txt
12
14
  quantcpp.egg-info/requires.txt
13
15
  quantcpp.egg-info/top_level.txt
14
16
  tests/test_basic.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ quantcpp = quantcpp.cli:main
File without changes
File without changes
File without changes
File without changes
File without changes