quantcpp 0.10.1__tar.gz → 0.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quantcpp-0.10.1/quantcpp.egg-info → quantcpp-0.11.0}/PKG-INFO +1 -1
- {quantcpp-0.10.1 → quantcpp-0.11.0}/pyproject.toml +4 -1
- {quantcpp-0.10.1 → quantcpp-0.11.0}/quant.h +81 -93
- {quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp/__init__.py +33 -18
- {quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp/_quant.h +81 -93
- quantcpp-0.11.0/quantcpp/cli.py +64 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0/quantcpp.egg-info}/PKG-INFO +1 -1
- {quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp.egg-info/SOURCES.txt +2 -0
- quantcpp-0.11.0/quantcpp.egg-info/entry_points.txt +2 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/MANIFEST.in +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/README.md +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp/_binding.py +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp.egg-info/dependency_links.txt +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp.egg-info/requires.txt +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/quantcpp.egg-info/top_level.txt +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/setup.cfg +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/setup.py +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/tests/test_basic.py +0 -0
- {quantcpp-0.10.1 → quantcpp-0.11.0}/tests/test_python.py +0 -0
|
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "quantcpp"
|
|
10
|
-
version = "0.10.1"
|
|
10
|
+
version = "0.11.0"
|
|
11
11
|
description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
|
|
12
12
|
readme = "README.md"
|
|
13
13
|
license = { text = "Apache-2.0" }
|
|
@@ -43,6 +43,9 @@ Source = "https://github.com/quantumaikr/quant.cpp"
|
|
|
43
43
|
Issues = "https://github.com/quantumaikr/quant.cpp/issues"
|
|
44
44
|
Changelog = "https://github.com/quantumaikr/quant.cpp/blob/main/CHANGELOG.md"
|
|
45
45
|
|
|
46
|
+
[project.scripts]
|
|
47
|
+
quantcpp = "quantcpp.cli:main"
|
|
48
|
+
|
|
46
49
|
[project.optional-dependencies]
|
|
47
50
|
dev = ["pytest>=7.0", "build", "twine"]
|
|
48
51
|
|
|
@@ -8033,32 +8033,75 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
|
|
|
8033
8033
|
}
|
|
8034
8034
|
}
|
|
8035
8035
|
|
|
8036
|
-
/*
|
|
8036
|
+
/* Build sorted indices BEFORE merge parsing so str_lookup() can use
|
|
8037
|
+
* binary search instead of O(n) linear scan. For 248K vocab with
|
|
8038
|
+
* ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
|
|
8039
|
+
tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
|
|
8040
|
+
if (tok->sorted_indices) {
|
|
8041
|
+
for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
|
|
8042
|
+
g_vocab_for_sort = tok->vocab;
|
|
8043
|
+
qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
|
|
8044
|
+
}
|
|
8045
|
+
|
|
8046
|
+
/* Load and parse merges if available.
|
|
8047
|
+
* GGUF stores merges as a string array of "tok_a tok_b" pairs.
|
|
8048
|
+
* We need to look up token IDs and build (id_a, id_b, id_merged) triples
|
|
8049
|
+
* so the BPE encoder can use them. */
|
|
8037
8050
|
int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
|
|
8038
8051
|
if (merges_idx >= 0) {
|
|
8039
8052
|
const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
|
|
8040
8053
|
if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
|
|
8041
8054
|
mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
|
|
8042
|
-
|
|
8043
|
-
|
|
8044
|
-
tok->n_merges =
|
|
8045
|
-
tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
|
|
8055
|
+
uint64_t n_merges_total = mkv->value.array.count;
|
|
8056
|
+
tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
|
|
8057
|
+
tok->n_merges = 0;
|
|
8046
8058
|
if (tok->merge_pairs) {
|
|
8047
|
-
|
|
8059
|
+
tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
|
|
8060
|
+
for (uint64_t mi = 0; mi < n_merges_total; mi++) {
|
|
8061
|
+
if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
|
|
8062
|
+
|
|
8063
|
+
/* Copy merge string and split on space: "tok_a tok_b" */
|
|
8064
|
+
char buf[2048];
|
|
8065
|
+
int slen = (int)merge_strings[mi].len;
|
|
8066
|
+
if (slen >= (int)sizeof(buf)) continue;
|
|
8067
|
+
memcpy(buf, merge_strings[mi].str, (size_t)slen);
|
|
8068
|
+
buf[slen] = '\0';
|
|
8069
|
+
|
|
8070
|
+
char* sep = strchr(buf, ' ');
|
|
8071
|
+
if (!sep) continue;
|
|
8072
|
+
*sep = '\0';
|
|
8073
|
+
const char* str_a = buf;
|
|
8074
|
+
const char* str_b = sep + 1;
|
|
8075
|
+
|
|
8076
|
+
/* Build merged string: concatenation of tok_a + tok_b */
|
|
8077
|
+
char merged[2048];
|
|
8078
|
+
int la = (int)strlen(str_a);
|
|
8079
|
+
int lb = (int)strlen(str_b);
|
|
8080
|
+
if (la + lb >= (int)sizeof(merged)) continue;
|
|
8081
|
+
memcpy(merged, str_a, (size_t)la);
|
|
8082
|
+
memcpy(merged + la, str_b, (size_t)lb);
|
|
8083
|
+
merged[la + lb] = '\0';
|
|
8084
|
+
|
|
8085
|
+
/* Look up token IDs via binary search (sorted_indices built above) */
|
|
8086
|
+
int id_a = str_lookup(tok, str_a);
|
|
8087
|
+
int id_b = str_lookup(tok, str_b);
|
|
8088
|
+
int id_merged = str_lookup(tok, merged);
|
|
8089
|
+
|
|
8090
|
+
if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
|
|
8091
|
+
tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
|
|
8092
|
+
tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
|
|
8093
|
+
tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
|
|
8094
|
+
/* Priority: earlier merges in GGUF = higher priority */
|
|
8095
|
+
tok->scores[id_merged] = (float)(n_merges_total - mi);
|
|
8096
|
+
tok->n_merges++;
|
|
8097
|
+
}
|
|
8098
|
+
}
|
|
8099
|
+
fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
|
|
8100
|
+
tok->n_merges, (int)n_merges_total);
|
|
8048
8101
|
}
|
|
8049
8102
|
}
|
|
8050
8103
|
}
|
|
8051
8104
|
|
|
8052
|
-
/* Build sorted indices for encoding (binary search by string).
|
|
8053
|
-
* Use qsort for O(n log n) instead of insertion sort O(n²) — critical
|
|
8054
|
-
* for 248K vocab where insertion sort would take minutes. */
|
|
8055
|
-
tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
|
|
8056
|
-
if (tok->sorted_indices) {
|
|
8057
|
-
for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
|
|
8058
|
-
g_vocab_for_sort = tok->vocab;
|
|
8059
|
-
qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
|
|
8060
|
-
}
|
|
8061
|
-
|
|
8062
8105
|
fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
|
|
8063
8106
|
tok->vocab_size, tok->max_token_len);
|
|
8064
8107
|
return tok;
|
|
@@ -9939,18 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {
|
|
|
9939
9982
|
|
|
9940
9983
|
free(tensors);
|
|
9941
9984
|
|
|
9942
|
-
/* Qwen3.5 RMSNorm adjustment
|
|
9943
|
-
*
|
|
9944
|
-
*
|
|
9945
|
-
*
|
|
9946
|
-
|
|
9947
|
-
* This applies to: input_layernorm, post_attention_layernorm,
|
|
9948
|
-
* model.norm, q_norm, k_norm.
|
|
9949
|
-
* It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
|
|
9950
|
-
* uses plain weight without +1).
|
|
9951
|
-
*
|
|
9952
|
-
* We detect Qwen3.5 by the presence of DeltaNet layers. */
|
|
9953
|
-
if (model->config.delta_n_heads > 0) {
|
|
9985
|
+
/* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
|
|
9986
|
+
* Only for non-GGUF models (raw checkpoints). GGUF files from
|
|
9987
|
+
* llama.cpp already have +1 baked in by the converter.
|
|
9988
|
+
* Qwen2/Qwen3 use standard RMSNorm and never need +1. */
|
|
9989
|
+
if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
|
|
9954
9990
|
int dim_h = model->config.hidden_dim;
|
|
9955
9991
|
int head_dim_h = model->config.head_dim;
|
|
9956
9992
|
|
|
@@ -9979,7 +10015,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
|
|
|
9979
10015
|
for (int i = 0; i < dim_h; i++)
|
|
9980
10016
|
model->output_norm[i] += 1.0f;
|
|
9981
10017
|
}
|
|
9982
|
-
fprintf(stderr, "tq_load_model: applied
|
|
10018
|
+
fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
|
|
9983
10019
|
}
|
|
9984
10020
|
|
|
9985
10021
|
/* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */
|
|
@@ -15862,22 +15898,7 @@ void quant_free_string(char* str) {
|
|
|
15862
15898
|
if (str) free(str);
|
|
15863
15899
|
}
|
|
15864
15900
|
|
|
15865
|
-
/*
|
|
15866
|
-
* Context persistence — save/load KV cache to disk
|
|
15867
|
-
*
|
|
15868
|
-
* File format (binary, little-endian):
|
|
15869
|
-
* magic: 4 bytes "QKVC"
|
|
15870
|
-
* version: uint32 (1)
|
|
15871
|
-
* n_layers: uint32
|
|
15872
|
-
* kv_dim: uint32 (n_kv_heads * head_dim)
|
|
15873
|
-
* max_seq: uint32
|
|
15874
|
-
* n_tokens: uint32 (number of filled positions)
|
|
15875
|
-
* kv_type: uint32 (TQ_TYPE_* enum or TQ_TYPE_COUNT for fp32)
|
|
15876
|
-
* has_fp16v: uint32 (1 if value_cache_fp16 is used)
|
|
15877
|
-
* reserved: 32 bytes (future use)
|
|
15878
|
-
* data: raw KV cache bytes
|
|
15879
|
-
* ================================================================ */
|
|
15880
|
-
|
|
15901
|
+
/* Context persistence: QKVC format (64-byte header + raw KV data) */
|
|
15881
15902
|
int quant_save_context(quant_ctx* ctx, const char* path) {
|
|
15882
15903
|
if (!ctx || !ctx->state || !path) return -1;
|
|
15883
15904
|
FILE* fp = fopen(path, "wb");
|
|
@@ -15886,29 +15907,17 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
|
|
|
15886
15907
|
tq_state_t* s = ctx->state;
|
|
15887
15908
|
tq_model_config_t* c = &ctx->model->config;
|
|
15888
15909
|
int kv_dim = c->n_kv_heads * c->head_dim;
|
|
15889
|
-
|
|
15890
|
-
/* Header */
|
|
15891
15910
|
fwrite("QKVC", 1, 4, fp);
|
|
15892
|
-
uint32_t
|
|
15893
|
-
|
|
15894
|
-
|
|
15895
|
-
|
|
15896
|
-
|
|
15897
|
-
uint32_t kt =
|
|
15898
|
-
uint32_t hfp16 = s->value_cache_fp16 ? 1 : 0;
|
|
15899
|
-
fwrite(&version, 4, 1, fp);
|
|
15900
|
-
fwrite(&nl, 4, 1, fp);
|
|
15901
|
-
fwrite(&kd, 4, 1, fp);
|
|
15902
|
-
fwrite(&ms, 4, 1, fp);
|
|
15903
|
-
fwrite(&nt, 4, 1, fp);
|
|
15904
|
-
fwrite(&kt, 4, 1, fp);
|
|
15905
|
-
fwrite(&hfp16, 4, 1, fp);
|
|
15906
|
-
char reserved[32] = {0};
|
|
15907
|
-
fwrite(reserved, 1, 32, fp);
|
|
15911
|
+
uint32_t hdr[7] = { 1, (uint32_t)c->n_layers, (uint32_t)kv_dim,
|
|
15912
|
+
(uint32_t)c->max_seq_len, (uint32_t)ctx->n_ctx_tokens,
|
|
15913
|
+
(uint32_t)s->kv_quant_type, s->value_cache_fp16 ? 1u : 0u };
|
|
15914
|
+
fwrite(hdr, 4, 7, fp);
|
|
15915
|
+
char reserved[32] = {0}; fwrite(reserved, 1, 32, fp);
|
|
15916
|
+
uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
|
|
15908
15917
|
|
|
15909
15918
|
/* KV data: write only the filled portion (nt tokens) */
|
|
15910
15919
|
for (uint32_t l = 0; l < nl; l++) {
|
|
15911
|
-
size_t layer_stride = (size_t)
|
|
15920
|
+
size_t layer_stride = (size_t)c->max_seq_len * kv_dim;
|
|
15912
15921
|
/* Key cache: FP32 or quantized */
|
|
15913
15922
|
if (s->key_cache) {
|
|
15914
15923
|
fwrite(s->key_cache + l * layer_stride, sizeof(float),
|
|
@@ -15916,7 +15925,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
|
|
|
15916
15925
|
}
|
|
15917
15926
|
if (s->quant_key_cache && kt < TQ_TYPE_COUNT) {
|
|
15918
15927
|
size_t blk_sz = tq_type_type_size(kt);
|
|
15919
|
-
uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)
|
|
15928
|
+
uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)c->max_seq_len * blk_sz;
|
|
15920
15929
|
fwrite(qbase, blk_sz, nt, fp);
|
|
15921
15930
|
}
|
|
15922
15931
|
/* Value cache: FP32 or FP16 */
|
|
@@ -15925,7 +15934,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
|
|
|
15925
15934
|
(size_t)nt * kv_dim, fp);
|
|
15926
15935
|
}
|
|
15927
15936
|
if (s->value_cache_fp16) {
|
|
15928
|
-
size_t layer_stride16 = (size_t)
|
|
15937
|
+
size_t layer_stride16 = (size_t)c->max_seq_len * kv_dim;
|
|
15929
15938
|
fwrite(s->value_cache_fp16 + l * layer_stride16, sizeof(uint16_t),
|
|
15930
15939
|
(size_t)nt * kv_dim, fp);
|
|
15931
15940
|
}
|
|
@@ -15942,37 +15951,16 @@ int quant_load_context(quant_ctx* ctx, const char* path) {
|
|
|
15942
15951
|
FILE* fp = fopen(path, "rb");
|
|
15943
15952
|
if (!fp) return -1;
|
|
15944
15953
|
|
|
15945
|
-
/* Read and validate header */
|
|
15946
15954
|
char magic[4];
|
|
15947
|
-
if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) {
|
|
15948
|
-
|
|
15949
|
-
|
|
15950
|
-
uint32_t
|
|
15951
|
-
fread(&version, 4, 1, fp);
|
|
15952
|
-
fread(&nl, 4, 1, fp);
|
|
15953
|
-
fread(&kd, 4, 1, fp);
|
|
15954
|
-
fread(&ms, 4, 1, fp);
|
|
15955
|
-
fread(&nt, 4, 1, fp);
|
|
15956
|
-
fread(&kt, 4, 1, fp);
|
|
15957
|
-
fread(&hfp16, 4, 1, fp);
|
|
15958
|
-
char reserved[32];
|
|
15959
|
-
fread(reserved, 1, 32, fp);
|
|
15960
|
-
|
|
15955
|
+
if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) { fclose(fp); return -1; }
|
|
15956
|
+
uint32_t hdr[7]; fread(hdr, 4, 7, fp);
|
|
15957
|
+
char reserved[32]; fread(reserved, 1, 32, fp);
|
|
15958
|
+
uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
|
|
15961
15959
|
tq_state_t* s = ctx->state;
|
|
15962
15960
|
tq_model_config_t* c = &ctx->model->config;
|
|
15963
15961
|
int kv_dim = c->n_kv_heads * c->head_dim;
|
|
15964
|
-
|
|
15965
|
-
|
|
15966
|
-
if (nl != (uint32_t)c->n_layers || kd != (uint32_t)kv_dim) {
|
|
15967
|
-
fprintf(stderr, "quant_load_context: model mismatch (layers %u vs %d, kv_dim %u vs %d)\n",
|
|
15968
|
-
nl, c->n_layers, kd, kv_dim);
|
|
15969
|
-
fclose(fp); return -1;
|
|
15970
|
-
}
|
|
15971
|
-
if (nt > (uint32_t)c->max_seq_len) {
|
|
15972
|
-
fprintf(stderr, "quant_load_context: saved %u tokens > max_seq_len %d\n",
|
|
15973
|
-
nt, c->max_seq_len);
|
|
15974
|
-
fclose(fp); return -1;
|
|
15975
|
-
}
|
|
15962
|
+
if (nl != (uint32_t)c->n_layers || hdr[2] != (uint32_t)kv_dim) { fclose(fp); return -1; }
|
|
15963
|
+
if (nt > (uint32_t)c->max_seq_len) { fclose(fp); return -1; }
|
|
15976
15964
|
|
|
15977
15965
|
/* Read KV data */
|
|
15978
15966
|
for (uint32_t l = 0; l < nl; l++) {
|
|
@@ -1,25 +1,21 @@
|
|
|
1
1
|
"""
|
|
2
|
-
quantcpp --
|
|
2
|
+
quantcpp -- Compress AI's memory 3x. It gets faster.
|
|
3
3
|
|
|
4
|
-
Quick start
|
|
4
|
+
Quick start:
|
|
5
5
|
|
|
6
6
|
from quantcpp import Model
|
|
7
|
-
m = Model.from_pretrained("
|
|
7
|
+
m = Model.from_pretrained("Llama-3.2-1B")
|
|
8
8
|
print(m.ask("What is gravity?"))
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
m = Model("path/to/model.gguf", temperature=0.7, max_tokens=256)
|
|
13
|
-
for token in m.generate("Once upon a time"):
|
|
14
|
-
print(token, end="", flush=True)
|
|
15
|
-
m.close()
|
|
10
|
+
Note: SmolLM2-135M downloads faster but produces low-quality output.
|
|
11
|
+
Use Llama-3.2-1B (~750 MB, one-time download) for good results.
|
|
16
12
|
"""
|
|
17
13
|
|
|
18
14
|
try:
|
|
19
15
|
from importlib.metadata import version as _pkg_version
|
|
20
16
|
__version__ = _pkg_version("quantcpp")
|
|
21
17
|
except Exception:
|
|
22
|
-
__version__ = "0.
|
|
18
|
+
__version__ = "0.11.0" # fallback for editable / source-tree imports
|
|
23
19
|
|
|
24
20
|
import os
|
|
25
21
|
import sys
|
|
@@ -53,6 +49,11 @@ _MODEL_REGISTRY = {
|
|
|
53
49
|
"smollm2-135m-instruct-q8_0.gguf",
|
|
54
50
|
135,
|
|
55
51
|
),
|
|
52
|
+
"Qwen3.5-0.8B": (
|
|
53
|
+
"unsloth/Qwen3.5-0.8B-GGUF",
|
|
54
|
+
"Qwen3.5-0.8B-Q4_K_M.gguf",
|
|
55
|
+
508,
|
|
56
|
+
),
|
|
56
57
|
"Llama-3.2-1B": (
|
|
57
58
|
"hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
|
|
58
59
|
"llama-3.2-1b-instruct-q4_k_m.gguf",
|
|
@@ -192,17 +193,21 @@ class Model:
|
|
|
192
193
|
n_threads: int = 4,
|
|
193
194
|
kv_compress: int = 1,
|
|
194
195
|
context_length: int = 0,
|
|
195
|
-
progressive: bool =
|
|
196
|
+
progressive: bool = True,
|
|
197
|
+
aggressive: bool = False,
|
|
196
198
|
):
|
|
197
199
|
"""
|
|
198
200
|
Parameters
|
|
199
201
|
----------
|
|
200
202
|
progressive : bool
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
203
|
+
Progressive KV compression (default True). Keeps last 128
|
|
204
|
+
tokens' keys at FP32 while compressing the rest. Verified
|
|
205
|
+
on 3 models: +0% to +3% PPL improvement at 1.75 MB cost.
|
|
206
|
+
No reason to disable — it's strictly better.
|
|
207
|
+
aggressive : bool
|
|
208
|
+
Maximum memory savings (default False). Uses 4-bit KV with
|
|
209
|
+
last 512 tokens at FP32. Ideal for very long context.
|
|
210
|
+
At 128K context: 4.6 GB instead of 9.2 GB KV cache.
|
|
206
211
|
"""
|
|
207
212
|
if not os.path.isfile(path):
|
|
208
213
|
raise FileNotFoundError(f"Model file not found: {path}")
|
|
@@ -212,11 +217,21 @@ class Model:
|
|
|
212
217
|
self._top_p = top_p
|
|
213
218
|
self._max_tokens = max_tokens
|
|
214
219
|
self._n_threads = n_threads
|
|
215
|
-
self._kv_compress = kv_compress
|
|
216
220
|
self._context_length = context_length
|
|
217
221
|
self._progressive = progressive
|
|
222
|
+
self._aggressive = aggressive
|
|
223
|
+
|
|
224
|
+
if aggressive:
|
|
225
|
+
# 4-bit KV + 512-token FP32 window: best memory/quality ratio.
|
|
226
|
+
# Measured: same PPL as flat 4-bit, attention-aware precision.
|
|
227
|
+
# TODO: add uniform_2b (kv_compress=3) for 48% more savings.
|
|
228
|
+
k_win = 512
|
|
229
|
+
elif progressive:
|
|
230
|
+
k_win = 128
|
|
231
|
+
else:
|
|
232
|
+
k_win = 0
|
|
218
233
|
|
|
219
|
-
|
|
234
|
+
self._kv_compress = kv_compress
|
|
220
235
|
|
|
221
236
|
self._model = load_model(path)
|
|
222
237
|
self._ctx = new_context(
|
|
@@ -8033,32 +8033,75 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
|
|
|
8033
8033
|
}
|
|
8034
8034
|
}
|
|
8035
8035
|
|
|
8036
|
-
/*
|
|
8036
|
+
/* Build sorted indices BEFORE merge parsing so str_lookup() can use
|
|
8037
|
+
* binary search instead of O(n) linear scan. For 248K vocab with
|
|
8038
|
+
* ~50K merges (3 lookups each), this turns a ~10 s init into ~100 ms. */
|
|
8039
|
+
tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
|
|
8040
|
+
if (tok->sorted_indices) {
|
|
8041
|
+
for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
|
|
8042
|
+
g_vocab_for_sort = tok->vocab;
|
|
8043
|
+
qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
|
|
8044
|
+
}
|
|
8045
|
+
|
|
8046
|
+
/* Load and parse merges if available.
|
|
8047
|
+
* GGUF stores merges as a string array of "tok_a tok_b" pairs.
|
|
8048
|
+
* We need to look up token IDs and build (id_a, id_b, id_merged) triples
|
|
8049
|
+
* so the BPE encoder can use them. */
|
|
8037
8050
|
int64_t merges_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.merges");
|
|
8038
8051
|
if (merges_idx >= 0) {
|
|
8039
8052
|
const tq_gguf_kv_t* mkv = &gguf->kv[merges_idx];
|
|
8040
8053
|
if (mkv->type == TQ_GGUF_TYPE_ARRAY &&
|
|
8041
8054
|
mkv->value.array.elem_type == TQ_GGUF_TYPE_STRING) {
|
|
8042
|
-
|
|
8043
|
-
|
|
8044
|
-
tok->n_merges =
|
|
8045
|
-
tok->merge_pairs = (int*)malloc(n_merges * 3 * sizeof(int));
|
|
8055
|
+
uint64_t n_merges_total = mkv->value.array.count;
|
|
8056
|
+
tok->merge_pairs = (int*)malloc(n_merges_total * 3 * sizeof(int));
|
|
8057
|
+
tok->n_merges = 0;
|
|
8046
8058
|
if (tok->merge_pairs) {
|
|
8047
|
-
|
|
8059
|
+
tq_gguf_string_t* merge_strings = (tq_gguf_string_t*)mkv->value.array.data;
|
|
8060
|
+
for (uint64_t mi = 0; mi < n_merges_total; mi++) {
|
|
8061
|
+
if (!merge_strings[mi].str || merge_strings[mi].len == 0) continue;
|
|
8062
|
+
|
|
8063
|
+
/* Copy merge string and split on space: "tok_a tok_b" */
|
|
8064
|
+
char buf[2048];
|
|
8065
|
+
int slen = (int)merge_strings[mi].len;
|
|
8066
|
+
if (slen >= (int)sizeof(buf)) continue;
|
|
8067
|
+
memcpy(buf, merge_strings[mi].str, (size_t)slen);
|
|
8068
|
+
buf[slen] = '\0';
|
|
8069
|
+
|
|
8070
|
+
char* sep = strchr(buf, ' ');
|
|
8071
|
+
if (!sep) continue;
|
|
8072
|
+
*sep = '\0';
|
|
8073
|
+
const char* str_a = buf;
|
|
8074
|
+
const char* str_b = sep + 1;
|
|
8075
|
+
|
|
8076
|
+
/* Build merged string: concatenation of tok_a + tok_b */
|
|
8077
|
+
char merged[2048];
|
|
8078
|
+
int la = (int)strlen(str_a);
|
|
8079
|
+
int lb = (int)strlen(str_b);
|
|
8080
|
+
if (la + lb >= (int)sizeof(merged)) continue;
|
|
8081
|
+
memcpy(merged, str_a, (size_t)la);
|
|
8082
|
+
memcpy(merged + la, str_b, (size_t)lb);
|
|
8083
|
+
merged[la + lb] = '\0';
|
|
8084
|
+
|
|
8085
|
+
/* Look up token IDs via binary search (sorted_indices built above) */
|
|
8086
|
+
int id_a = str_lookup(tok, str_a);
|
|
8087
|
+
int id_b = str_lookup(tok, str_b);
|
|
8088
|
+
int id_merged = str_lookup(tok, merged);
|
|
8089
|
+
|
|
8090
|
+
if (id_a >= 0 && id_b >= 0 && id_merged >= 0) {
|
|
8091
|
+
tok->merge_pairs[tok->n_merges * 3 + 0] = id_a;
|
|
8092
|
+
tok->merge_pairs[tok->n_merges * 3 + 1] = id_b;
|
|
8093
|
+
tok->merge_pairs[tok->n_merges * 3 + 2] = id_merged;
|
|
8094
|
+
/* Priority: earlier merges in GGUF = higher priority */
|
|
8095
|
+
tok->scores[id_merged] = (float)(n_merges_total - mi);
|
|
8096
|
+
tok->n_merges++;
|
|
8097
|
+
}
|
|
8098
|
+
}
|
|
8099
|
+
fprintf(stderr, "tq_load_tokenizer_from_gguf: parsed %d/%d merges\n",
|
|
8100
|
+
tok->n_merges, (int)n_merges_total);
|
|
8048
8101
|
}
|
|
8049
8102
|
}
|
|
8050
8103
|
}
|
|
8051
8104
|
|
|
8052
|
-
/* Build sorted indices for encoding (binary search by string).
|
|
8053
|
-
* Use qsort for O(n log n) instead of insertion sort O(n²) — critical
|
|
8054
|
-
* for 248K vocab where insertion sort would take minutes. */
|
|
8055
|
-
tok->sorted_indices = (int*)malloc(vocab_size * sizeof(int));
|
|
8056
|
-
if (tok->sorted_indices) {
|
|
8057
|
-
for (int i = 0; i < (int)vocab_size; i++) tok->sorted_indices[i] = i;
|
|
8058
|
-
g_vocab_for_sort = tok->vocab;
|
|
8059
|
-
qsort(tok->sorted_indices, vocab_size, sizeof(int), cmp_vocab_idx);
|
|
8060
|
-
}
|
|
8061
|
-
|
|
8062
8105
|
fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
|
|
8063
8106
|
tok->vocab_size, tok->max_token_len);
|
|
8064
8107
|
return tok;
|
|
@@ -9939,18 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {
|
|
|
9939
9982
|
|
|
9940
9983
|
free(tensors);
|
|
9941
9984
|
|
|
9942
|
-
/* Qwen3.5 RMSNorm adjustment
|
|
9943
|
-
*
|
|
9944
|
-
*
|
|
9945
|
-
*
|
|
9946
|
-
|
|
9947
|
-
* This applies to: input_layernorm, post_attention_layernorm,
|
|
9948
|
-
* model.norm, q_norm, k_norm.
|
|
9949
|
-
* It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
|
|
9950
|
-
* uses plain weight without +1).
|
|
9951
|
-
*
|
|
9952
|
-
* We detect Qwen3.5 by the presence of DeltaNet layers. */
|
|
9953
|
-
if (model->config.delta_n_heads > 0) {
|
|
9985
|
+
/* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
|
|
9986
|
+
* Only for non-GGUF models (raw checkpoints). GGUF files from
|
|
9987
|
+
* llama.cpp already have +1 baked in by the converter.
|
|
9988
|
+
* Qwen2/Qwen3 use standard RMSNorm and never need +1. */
|
|
9989
|
+
if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
|
|
9954
9990
|
int dim_h = model->config.hidden_dim;
|
|
9955
9991
|
int head_dim_h = model->config.head_dim;
|
|
9956
9992
|
|
|
@@ -9979,7 +10015,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
|
|
|
9979
10015
|
for (int i = 0; i < dim_h; i++)
|
|
9980
10016
|
model->output_norm[i] += 1.0f;
|
|
9981
10017
|
}
|
|
9982
|
-
fprintf(stderr, "tq_load_model: applied
|
|
10018
|
+
fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
|
|
9983
10019
|
}
|
|
9984
10020
|
|
|
9985
10021
|
/* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */
|
|
@@ -15862,22 +15898,7 @@ void quant_free_string(char* str) {
|
|
|
15862
15898
|
if (str) free(str);
|
|
15863
15899
|
}
|
|
15864
15900
|
|
|
15865
|
-
/*
|
|
15866
|
-
* Context persistence — save/load KV cache to disk
|
|
15867
|
-
*
|
|
15868
|
-
* File format (binary, little-endian):
|
|
15869
|
-
* magic: 4 bytes "QKVC"
|
|
15870
|
-
* version: uint32 (1)
|
|
15871
|
-
* n_layers: uint32
|
|
15872
|
-
* kv_dim: uint32 (n_kv_heads * head_dim)
|
|
15873
|
-
* max_seq: uint32
|
|
15874
|
-
* n_tokens: uint32 (number of filled positions)
|
|
15875
|
-
* kv_type: uint32 (TQ_TYPE_* enum or TQ_TYPE_COUNT for fp32)
|
|
15876
|
-
* has_fp16v: uint32 (1 if value_cache_fp16 is used)
|
|
15877
|
-
* reserved: 32 bytes (future use)
|
|
15878
|
-
* data: raw KV cache bytes
|
|
15879
|
-
* ================================================================ */
|
|
15880
|
-
|
|
15901
|
+
/* Context persistence: QKVC format (64-byte header + raw KV data) */
|
|
15881
15902
|
int quant_save_context(quant_ctx* ctx, const char* path) {
|
|
15882
15903
|
if (!ctx || !ctx->state || !path) return -1;
|
|
15883
15904
|
FILE* fp = fopen(path, "wb");
|
|
@@ -15886,29 +15907,17 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
|
|
|
15886
15907
|
tq_state_t* s = ctx->state;
|
|
15887
15908
|
tq_model_config_t* c = &ctx->model->config;
|
|
15888
15909
|
int kv_dim = c->n_kv_heads * c->head_dim;
|
|
15889
|
-
|
|
15890
|
-
/* Header */
|
|
15891
15910
|
fwrite("QKVC", 1, 4, fp);
|
|
15892
|
-
uint32_t
|
|
15893
|
-
|
|
15894
|
-
|
|
15895
|
-
|
|
15896
|
-
|
|
15897
|
-
uint32_t kt =
|
|
15898
|
-
uint32_t hfp16 = s->value_cache_fp16 ? 1 : 0;
|
|
15899
|
-
fwrite(&version, 4, 1, fp);
|
|
15900
|
-
fwrite(&nl, 4, 1, fp);
|
|
15901
|
-
fwrite(&kd, 4, 1, fp);
|
|
15902
|
-
fwrite(&ms, 4, 1, fp);
|
|
15903
|
-
fwrite(&nt, 4, 1, fp);
|
|
15904
|
-
fwrite(&kt, 4, 1, fp);
|
|
15905
|
-
fwrite(&hfp16, 4, 1, fp);
|
|
15906
|
-
char reserved[32] = {0};
|
|
15907
|
-
fwrite(reserved, 1, 32, fp);
|
|
15911
|
+
uint32_t hdr[7] = { 1, (uint32_t)c->n_layers, (uint32_t)kv_dim,
|
|
15912
|
+
(uint32_t)c->max_seq_len, (uint32_t)ctx->n_ctx_tokens,
|
|
15913
|
+
(uint32_t)s->kv_quant_type, s->value_cache_fp16 ? 1u : 0u };
|
|
15914
|
+
fwrite(hdr, 4, 7, fp);
|
|
15915
|
+
char reserved[32] = {0}; fwrite(reserved, 1, 32, fp);
|
|
15916
|
+
uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
|
|
15908
15917
|
|
|
15909
15918
|
/* KV data: write only the filled portion (nt tokens) */
|
|
15910
15919
|
for (uint32_t l = 0; l < nl; l++) {
|
|
15911
|
-
size_t layer_stride = (size_t)
|
|
15920
|
+
size_t layer_stride = (size_t)c->max_seq_len * kv_dim;
|
|
15912
15921
|
/* Key cache: FP32 or quantized */
|
|
15913
15922
|
if (s->key_cache) {
|
|
15914
15923
|
fwrite(s->key_cache + l * layer_stride, sizeof(float),
|
|
@@ -15916,7 +15925,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
|
|
|
15916
15925
|
}
|
|
15917
15926
|
if (s->quant_key_cache && kt < TQ_TYPE_COUNT) {
|
|
15918
15927
|
size_t blk_sz = tq_type_type_size(kt);
|
|
15919
|
-
uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)
|
|
15928
|
+
uint8_t* qbase = (uint8_t*)s->quant_key_cache + l * (size_t)c->max_seq_len * blk_sz;
|
|
15920
15929
|
fwrite(qbase, blk_sz, nt, fp);
|
|
15921
15930
|
}
|
|
15922
15931
|
/* Value cache: FP32 or FP16 */
|
|
@@ -15925,7 +15934,7 @@ int quant_save_context(quant_ctx* ctx, const char* path) {
|
|
|
15925
15934
|
(size_t)nt * kv_dim, fp);
|
|
15926
15935
|
}
|
|
15927
15936
|
if (s->value_cache_fp16) {
|
|
15928
|
-
size_t layer_stride16 = (size_t)
|
|
15937
|
+
size_t layer_stride16 = (size_t)c->max_seq_len * kv_dim;
|
|
15929
15938
|
fwrite(s->value_cache_fp16 + l * layer_stride16, sizeof(uint16_t),
|
|
15930
15939
|
(size_t)nt * kv_dim, fp);
|
|
15931
15940
|
}
|
|
@@ -15942,37 +15951,16 @@ int quant_load_context(quant_ctx* ctx, const char* path) {
|
|
|
15942
15951
|
FILE* fp = fopen(path, "rb");
|
|
15943
15952
|
if (!fp) return -1;
|
|
15944
15953
|
|
|
15945
|
-
/* Read and validate header */
|
|
15946
15954
|
char magic[4];
|
|
15947
|
-
if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) {
|
|
15948
|
-
|
|
15949
|
-
|
|
15950
|
-
uint32_t
|
|
15951
|
-
fread(&version, 4, 1, fp);
|
|
15952
|
-
fread(&nl, 4, 1, fp);
|
|
15953
|
-
fread(&kd, 4, 1, fp);
|
|
15954
|
-
fread(&ms, 4, 1, fp);
|
|
15955
|
-
fread(&nt, 4, 1, fp);
|
|
15956
|
-
fread(&kt, 4, 1, fp);
|
|
15957
|
-
fread(&hfp16, 4, 1, fp);
|
|
15958
|
-
char reserved[32];
|
|
15959
|
-
fread(reserved, 1, 32, fp);
|
|
15960
|
-
|
|
15955
|
+
if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "QKVC", 4) != 0) { fclose(fp); return -1; }
|
|
15956
|
+
uint32_t hdr[7]; fread(hdr, 4, 7, fp);
|
|
15957
|
+
char reserved[32]; fread(reserved, 1, 32, fp);
|
|
15958
|
+
uint32_t nl = hdr[1], nt = hdr[4], kt = hdr[5];
|
|
15961
15959
|
tq_state_t* s = ctx->state;
|
|
15962
15960
|
tq_model_config_t* c = &ctx->model->config;
|
|
15963
15961
|
int kv_dim = c->n_kv_heads * c->head_dim;
|
|
15964
|
-
|
|
15965
|
-
|
|
15966
|
-
if (nl != (uint32_t)c->n_layers || kd != (uint32_t)kv_dim) {
|
|
15967
|
-
fprintf(stderr, "quant_load_context: model mismatch (layers %u vs %d, kv_dim %u vs %d)\n",
|
|
15968
|
-
nl, c->n_layers, kd, kv_dim);
|
|
15969
|
-
fclose(fp); return -1;
|
|
15970
|
-
}
|
|
15971
|
-
if (nt > (uint32_t)c->max_seq_len) {
|
|
15972
|
-
fprintf(stderr, "quant_load_context: saved %u tokens > max_seq_len %d\n",
|
|
15973
|
-
nt, c->max_seq_len);
|
|
15974
|
-
fclose(fp); return -1;
|
|
15975
|
-
}
|
|
15962
|
+
if (nl != (uint32_t)c->n_layers || hdr[2] != (uint32_t)kv_dim) { fclose(fp); return -1; }
|
|
15963
|
+
if (nt > (uint32_t)c->max_seq_len) { fclose(fp); return -1; }
|
|
15976
15964
|
|
|
15977
15965
|
/* Read KV data */
|
|
15978
15966
|
for (uint32_t l = 0; l < nl; l++) {
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
quantcpp CLI — chat with a local LLM in your terminal.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
quantcpp # auto-downloads Llama-3.2-1B, starts chat
|
|
6
|
+
quantcpp "What is gravity?" # one-shot question
|
|
7
|
+
quantcpp --model SmolLM2-135M # use a smaller model (faster download)
|
|
8
|
+
quantcpp --model path/to/file.gguf # use your own GGUF file
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import sys
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main():
|
|
16
|
+
import argparse
|
|
17
|
+
parser = argparse.ArgumentParser(
|
|
18
|
+
prog="quantcpp",
|
|
19
|
+
description="Chat with a local LLM. No API key, no GPU, no server.",
|
|
20
|
+
)
|
|
21
|
+
parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
|
|
22
|
+
parser.add_argument("--model", "-m", default="Llama-3.2-1B",
|
|
23
|
+
help="Model name or path to .gguf file (default: Llama-3.2-1B)")
|
|
24
|
+
parser.add_argument("--max-tokens", "-n", type=int, default=256)
|
|
25
|
+
parser.add_argument("--temperature", "-t", type=float, default=0.7)
|
|
26
|
+
args = parser.parse_args()
|
|
27
|
+
|
|
28
|
+
from quantcpp import Model
|
|
29
|
+
|
|
30
|
+
# Load model
|
|
31
|
+
model_path = args.model
|
|
32
|
+
if os.path.isfile(model_path):
|
|
33
|
+
print(f"Loading {model_path}...", file=sys.stderr)
|
|
34
|
+
m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
|
|
35
|
+
else:
|
|
36
|
+
print(f"Downloading {model_path}...", file=sys.stderr)
|
|
37
|
+
m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
|
|
38
|
+
temperature=args.temperature)
|
|
39
|
+
|
|
40
|
+
# One-shot or interactive
|
|
41
|
+
if args.prompt:
|
|
42
|
+
question = " ".join(args.prompt)
|
|
43
|
+
for tok in m.generate(question):
|
|
44
|
+
print(tok, end="", flush=True)
|
|
45
|
+
print()
|
|
46
|
+
else:
|
|
47
|
+
print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
|
|
48
|
+
try:
|
|
49
|
+
while True:
|
|
50
|
+
question = input("\nYou: ")
|
|
51
|
+
if not question.strip():
|
|
52
|
+
continue
|
|
53
|
+
print("AI: ", end="", flush=True)
|
|
54
|
+
for tok in m.generate(question):
|
|
55
|
+
print(tok, end="", flush=True)
|
|
56
|
+
print()
|
|
57
|
+
except (KeyboardInterrupt, EOFError):
|
|
58
|
+
print("\nBye!", file=sys.stderr)
|
|
59
|
+
|
|
60
|
+
m.close()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
if __name__ == "__main__":
|
|
64
|
+
main()
|
|
@@ -6,9 +6,11 @@ setup.py
|
|
|
6
6
|
quantcpp/__init__.py
|
|
7
7
|
quantcpp/_binding.py
|
|
8
8
|
quantcpp/_quant.h
|
|
9
|
+
quantcpp/cli.py
|
|
9
10
|
quantcpp.egg-info/PKG-INFO
|
|
10
11
|
quantcpp.egg-info/SOURCES.txt
|
|
11
12
|
quantcpp.egg-info/dependency_links.txt
|
|
13
|
+
quantcpp.egg-info/entry_points.txt
|
|
12
14
|
quantcpp.egg-info/requires.txt
|
|
13
15
|
quantcpp.egg-info/top_level.txt
|
|
14
16
|
tests/test_basic.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|