quantcpp 0.11.0__tar.gz → 0.12.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: quantcpp
3
- Version: 0.11.0
3
+ Version: 0.12.1
4
4
  Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
5
5
  Author-email: quantumaikr <noreply@quantumaikr.com>
6
6
  License: Apache-2.0
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "quantcpp"
10
- version = "0.11.0"
10
+ version = "0.12.1"
11
11
  description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
12
12
  readme = "README.md"
13
13
  license = { text = "Apache-2.0" }
@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
202
202
  // Section 1: Types and Specs (from tq_types.h, tq_spec.h)
203
203
  // ============================================================================
204
204
 
205
-
206
-
207
205
  /* Cross-language static assert: works in both C11 and C++11/17 */
208
206
  #ifdef __cplusplus
209
207
  #define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
219
217
  #define TQ_PI_2 1.5707963267948966f
220
218
  #endif
221
219
 
222
-
223
-
224
220
  /* ============================================================
225
221
  * Constants
226
222
  * ============================================================ */
@@ -398,8 +394,6 @@ typedef struct {
398
394
  int enable_recompression;/* Tier 1 → Tier 2 re-compression */
399
395
  } tq_progressive_config_t;
400
396
 
401
-
402
-
403
397
  /* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
404
398
  * 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
405
399
  * Block covers TQ_BK elements (128).
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
469
463
  TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
470
464
  TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
471
465
 
472
-
473
-
474
-
475
-
476
-
477
-
478
466
  /* Format specification — version-aware, ONNX-inspired */
479
467
 
480
468
  #define TQ_SPEC_VERSION 1
@@ -500,18 +488,10 @@ typedef struct {
500
488
  uint8_t flags; /* TQ_FLAG_* bitmask */
501
489
  } tq_format_spec_t;
502
490
 
503
-
504
-
505
-
506
-
507
491
  // ============================================================================
508
492
  // Section 2: Engine Types (from tq_engine.h)
509
493
  // ============================================================================
510
494
 
511
-
512
-
513
-
514
-
515
495
  /* ============================================================
516
496
  * Model configuration
517
497
  * ============================================================ */
@@ -886,6 +866,7 @@ typedef struct {
886
866
  int n_threads;
887
867
  float rep_penalty; /* repetition penalty (default: 1.1, 1.0 = disabled) */
888
868
  int rep_window; /* how many recent tokens to penalize (default: 32) */
869
+ unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
889
870
  /* Callback for streaming output */
890
871
  void (*on_token)(const char* text, void* user_data);
891
872
  void* user_data;
@@ -1123,9 +1104,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
1123
1104
  /* Max threads supported by thread pool */
1124
1105
  #define TQ_TP_MAX 16
1125
1106
 
1126
-
1127
-
1128
-
1129
1107
  // ============================================================================
1130
1108
  // Section 3: GGUF Types (from tq_gguf.h)
1131
1109
  // ============================================================================
@@ -1143,10 +1121,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
1143
1121
  * directly into TurboQuant inference engine.
1144
1122
  */
1145
1123
 
1146
-
1147
-
1148
-
1149
-
1150
1124
  /* ============================================================
1151
1125
  * GGUF format constants
1152
1126
  * ============================================================ */
@@ -1462,14 +1436,10 @@ int tq_metal_moe_forward(
1462
1436
  const int* up_types, /* per-expert up quant types, NULL = use weight_type */
1463
1437
  const int* down_types); /* per-expert down quant types, NULL = use weight_type */
1464
1438
 
1465
-
1466
-
1467
-
1468
1439
  // ============================================================================
1469
1440
  // Section 4: Internal API (from turboquant.h)
1470
1441
  // ============================================================================
1471
1442
 
1472
-
1473
1443
  /**
1474
1444
  * TurboQuant.cpp — Cross-platform KV cache compression library
1475
1445
  *
@@ -1477,9 +1447,6 @@ int tq_metal_moe_forward(
1477
1447
  * Zero external dependencies (libc/libm only).
1478
1448
  */
1479
1449
 
1480
-
1481
-
1482
-
1483
1450
  /* ============================================================
1484
1451
  * Version
1485
1452
  * ============================================================ */
@@ -1753,15 +1720,10 @@ void tq_progressive_free(tq_progressive_t* p);
1753
1720
 
1754
1721
  tq_progressive_config_t tq_progressive_default_config(void);
1755
1722
 
1756
-
1757
-
1758
-
1759
-
1760
1723
  // ============================================================================
1761
1724
  // Section 5: quant_ctx struct definition
1762
1725
  // ============================================================================
1763
1726
 
1764
-
1765
1727
  struct quant_ctx {
1766
1728
  tq_model_t* model;
1767
1729
  tq_state_t* state;
@@ -1788,7 +1750,6 @@ struct quant_ctx {
1788
1750
  * - Random signs decorrelate channels across different blocks
1789
1751
  */
1790
1752
 
1791
-
1792
1753
  #ifdef __ARM_NEON
1793
1754
  #include <arm_neon.h>
1794
1755
  #endif
@@ -1902,7 +1863,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
1902
1863
  */
1903
1864
  /* Generic reference — no compiler-specific pragmas */
1904
1865
 
1905
-
1906
1866
  /* ---------- FP16 helpers ---------- */
1907
1867
 
1908
1868
  static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2245,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
2285
2245
  // Section 8: Type Traits (from tq_traits.c)
2286
2246
  // ============================================================================
2287
2247
 
2288
-
2289
2248
  /* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
2290
2249
  static void tq_stub_quantize(const float* src, void* dst, int n) {
2291
2250
  (void)src; (void)dst; (void)n;
@@ -2583,7 +2542,6 @@ tq_type tq_type_from_name(const char* name) {
2583
2542
  * No external dependencies — libc/libm only.
2584
2543
  */
2585
2544
 
2586
-
2587
2545
  #ifdef __ARM_NEON
2588
2546
  #include <arm_neon.h>
2589
2547
  #endif
@@ -2617,7 +2575,6 @@ static struct {
2617
2575
 
2618
2576
  static int g_n_threads = 1;
2619
2577
 
2620
-
2621
2578
  static void* tp_worker(void* arg) {
2622
2579
  int id = (int)(intptr_t)arg;
2623
2580
  int my_gen = 0;
@@ -4173,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
4173
4130
  config.n_threads = 1;
4174
4131
  config.rep_penalty = 1.1f;
4175
4132
  config.rep_window = 32;
4133
+ config.rng_seed = 42ULL;
4176
4134
  config.on_token = NULL;
4177
4135
  config.user_data = NULL;
4178
4136
  return config;
@@ -4388,8 +4346,6 @@ void tq_matmul_1bit(float* out, const float* x,
4388
4346
  * SPDX-License-Identifier: MIT
4389
4347
  */
4390
4348
 
4391
-
4392
-
4393
4349
  #ifdef _WIN32
4394
4350
  #else
4395
4351
  #endif
@@ -5098,8 +5054,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
5098
5054
  * Pure C11, no external dependencies.
5099
5055
  */
5100
5056
 
5101
-
5102
-
5103
5057
  #if defined(__ARM_NEON) || defined(__ARM_NEON__)
5104
5058
  #include <arm_neon.h>
5105
5059
  #define TQ_HAS_NEON 1
@@ -7174,7 +7128,6 @@ void tq_metal_batch_end_if_available(void) {
7174
7128
  * Also supports the legacy llama2.c binary tokenizer format as fallback.
7175
7129
  */
7176
7130
 
7177
-
7178
7131
  /* Global for qsort comparator (vocab index sorting) */
7179
7132
  static char** g_vocab_for_sort;
7180
7133
  static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8519,7 +8472,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
8519
8472
  * Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
8520
8473
  */
8521
8474
 
8522
-
8523
8475
  #ifdef _WIN32
8524
8476
  #else
8525
8477
  #endif
@@ -12179,8 +12131,13 @@ tq_model_t* tq_load_gguf(const char* path) {
12179
12131
  }
12180
12132
 
12181
12133
  const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
12182
- /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
12134
+ /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
12135
+ * Can be set via environment variable or compile-time define (useful for WASM). */
12136
+ #ifdef TQ_NO_Q4
12137
+ if (1) {
12138
+ #else
12183
12139
  if (getenv("TQ_NO_Q4")) {
12140
+ #endif
12184
12141
  fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
12185
12142
  goto skip_q4_conversion;
12186
12143
  }
@@ -12929,7 +12886,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
12929
12886
  * -> residual add
12930
12887
  */
12931
12888
 
12932
-
12933
12889
  /* Unified Q2/1-bit matmul dispatch.
12934
12890
  * When model->use_1bit_weights, Q2 fields contain sign bits + norms,
12935
12891
  * dispatched to tq_matmul_1bit (FP32 input required).
@@ -15189,7 +15145,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
15189
15145
  }
15190
15146
  }
15191
15147
 
15192
-
15193
15148
  /* Increment profile token count if profiling is active */
15194
15149
  if (s->profile_kv) {
15195
15150
  s->profile_kv_count++;
@@ -15240,7 +15195,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
15240
15195
  * - Full generation loop with streaming callback
15241
15196
  */
15242
15197
 
15243
-
15244
15198
  /* ============================================================
15245
15199
  * Argmax sampling: return token with highest logit
15246
15200
  * ============================================================ */
@@ -15461,7 +15415,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15461
15415
  fprintf(stderr, "\n");
15462
15416
  }
15463
15417
 
15464
- /* Prefill: process all prompt tokens */
15418
+ /* Prefill: process all prompt tokens.
15419
+ * NOTE: No emscripten_sleep() here — the call stack during tq_forward()
15420
+ * is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
15421
+ * sleep here breaks ASYNCIFY for the entire generate call, including
15422
+ * the token streaming callback. The browser shows "Thinking..." via
15423
+ * requestAnimationFrame before entering this blocking prefill. */
15465
15424
  for (int i = 0; i < n_prompt; i++) {
15466
15425
  tq_forward(model, state, prompt_tokens[i], i);
15467
15426
  }
@@ -15496,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15496
15455
  }
15497
15456
  }
15498
15457
 
15499
- /* Sample first generated token */
15458
+ /* Sample first generated token. The seed is configurable via
15459
+ * config->rng_seed (default 42); 0 falls back to 42 so existing
15460
+ * callers that never set rng_seed get bit-identical behaviour. */
15500
15461
  int pos = n_prompt;
15501
- unsigned long long rng_state = 42;
15462
+ unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
15502
15463
  int next_token = tq_sample_topp(state->logits, vocab_size,
15503
15464
  config->temperature, config->top_p,
15504
15465
  &rng_state);
@@ -15663,7 +15624,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15663
15624
  return generated;
15664
15625
  }
15665
15626
 
15666
-
15667
15627
  // ============================================================================
15668
15628
 
15669
15629
  // ============================================================================
@@ -15,7 +15,7 @@ try:
15
15
  from importlib.metadata import version as _pkg_version
16
16
  __version__ = _pkg_version("quantcpp")
17
17
  except Exception:
18
- __version__ = "0.11.0" # fallback for editable / source-tree imports
18
+ __version__ = "0.12.1" # fallback for editable / source-tree imports
19
19
 
20
20
  import os
21
21
  import sys
@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
202
202
  // Section 1: Types and Specs (from tq_types.h, tq_spec.h)
203
203
  // ============================================================================
204
204
 
205
-
206
-
207
205
  /* Cross-language static assert: works in both C11 and C++11/17 */
208
206
  #ifdef __cplusplus
209
207
  #define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
219
217
  #define TQ_PI_2 1.5707963267948966f
220
218
  #endif
221
219
 
222
-
223
-
224
220
  /* ============================================================
225
221
  * Constants
226
222
  * ============================================================ */
@@ -398,8 +394,6 @@ typedef struct {
398
394
  int enable_recompression;/* Tier 1 → Tier 2 re-compression */
399
395
  } tq_progressive_config_t;
400
396
 
401
-
402
-
403
397
  /* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
404
398
  * 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
405
399
  * Block covers TQ_BK elements (128).
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
469
463
  TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
470
464
  TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
471
465
 
472
-
473
-
474
-
475
-
476
-
477
-
478
466
  /* Format specification — version-aware, ONNX-inspired */
479
467
 
480
468
  #define TQ_SPEC_VERSION 1
@@ -500,18 +488,10 @@ typedef struct {
500
488
  uint8_t flags; /* TQ_FLAG_* bitmask */
501
489
  } tq_format_spec_t;
502
490
 
503
-
504
-
505
-
506
-
507
491
  // ============================================================================
508
492
  // Section 2: Engine Types (from tq_engine.h)
509
493
  // ============================================================================
510
494
 
511
-
512
-
513
-
514
-
515
495
  /* ============================================================
516
496
  * Model configuration
517
497
  * ============================================================ */
@@ -886,6 +866,7 @@ typedef struct {
886
866
  int n_threads;
887
867
  float rep_penalty; /* repetition penalty (default: 1.1, 1.0 = disabled) */
888
868
  int rep_window; /* how many recent tokens to penalize (default: 32) */
869
+ unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
889
870
  /* Callback for streaming output */
890
871
  void (*on_token)(const char* text, void* user_data);
891
872
  void* user_data;
@@ -1123,9 +1104,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
1123
1104
  /* Max threads supported by thread pool */
1124
1105
  #define TQ_TP_MAX 16
1125
1106
 
1126
-
1127
-
1128
-
1129
1107
  // ============================================================================
1130
1108
  // Section 3: GGUF Types (from tq_gguf.h)
1131
1109
  // ============================================================================
@@ -1143,10 +1121,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
1143
1121
  * directly into TurboQuant inference engine.
1144
1122
  */
1145
1123
 
1146
-
1147
-
1148
-
1149
-
1150
1124
  /* ============================================================
1151
1125
  * GGUF format constants
1152
1126
  * ============================================================ */
@@ -1462,14 +1436,10 @@ int tq_metal_moe_forward(
1462
1436
  const int* up_types, /* per-expert up quant types, NULL = use weight_type */
1463
1437
  const int* down_types); /* per-expert down quant types, NULL = use weight_type */
1464
1438
 
1465
-
1466
-
1467
-
1468
1439
  // ============================================================================
1469
1440
  // Section 4: Internal API (from turboquant.h)
1470
1441
  // ============================================================================
1471
1442
 
1472
-
1473
1443
  /**
1474
1444
  * TurboQuant.cpp — Cross-platform KV cache compression library
1475
1445
  *
@@ -1477,9 +1447,6 @@ int tq_metal_moe_forward(
1477
1447
  * Zero external dependencies (libc/libm only).
1478
1448
  */
1479
1449
 
1480
-
1481
-
1482
-
1483
1450
  /* ============================================================
1484
1451
  * Version
1485
1452
  * ============================================================ */
@@ -1753,15 +1720,10 @@ void tq_progressive_free(tq_progressive_t* p);
1753
1720
 
1754
1721
  tq_progressive_config_t tq_progressive_default_config(void);
1755
1722
 
1756
-
1757
-
1758
-
1759
-
1760
1723
  // ============================================================================
1761
1724
  // Section 5: quant_ctx struct definition
1762
1725
  // ============================================================================
1763
1726
 
1764
-
1765
1727
  struct quant_ctx {
1766
1728
  tq_model_t* model;
1767
1729
  tq_state_t* state;
@@ -1788,7 +1750,6 @@ struct quant_ctx {
1788
1750
  * - Random signs decorrelate channels across different blocks
1789
1751
  */
1790
1752
 
1791
-
1792
1753
  #ifdef __ARM_NEON
1793
1754
  #include <arm_neon.h>
1794
1755
  #endif
@@ -1902,7 +1863,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
1902
1863
  */
1903
1864
  /* Generic reference — no compiler-specific pragmas */
1904
1865
 
1905
-
1906
1866
  /* ---------- FP16 helpers ---------- */
1907
1867
 
1908
1868
  static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2245,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
2285
2245
  // Section 8: Type Traits (from tq_traits.c)
2286
2246
  // ============================================================================
2287
2247
 
2288
-
2289
2248
  /* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
2290
2249
  static void tq_stub_quantize(const float* src, void* dst, int n) {
2291
2250
  (void)src; (void)dst; (void)n;
@@ -2583,7 +2542,6 @@ tq_type tq_type_from_name(const char* name) {
2583
2542
  * No external dependencies — libc/libm only.
2584
2543
  */
2585
2544
 
2586
-
2587
2545
  #ifdef __ARM_NEON
2588
2546
  #include <arm_neon.h>
2589
2547
  #endif
@@ -2617,7 +2575,6 @@ static struct {
2617
2575
 
2618
2576
  static int g_n_threads = 1;
2619
2577
 
2620
-
2621
2578
  static void* tp_worker(void* arg) {
2622
2579
  int id = (int)(intptr_t)arg;
2623
2580
  int my_gen = 0;
@@ -4173,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
4173
4130
  config.n_threads = 1;
4174
4131
  config.rep_penalty = 1.1f;
4175
4132
  config.rep_window = 32;
4133
+ config.rng_seed = 42ULL;
4176
4134
  config.on_token = NULL;
4177
4135
  config.user_data = NULL;
4178
4136
  return config;
@@ -4388,8 +4346,6 @@ void tq_matmul_1bit(float* out, const float* x,
4388
4346
  * SPDX-License-Identifier: MIT
4389
4347
  */
4390
4348
 
4391
-
4392
-
4393
4349
  #ifdef _WIN32
4394
4350
  #else
4395
4351
  #endif
@@ -5098,8 +5054,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
5098
5054
  * Pure C11, no external dependencies.
5099
5055
  */
5100
5056
 
5101
-
5102
-
5103
5057
  #if defined(__ARM_NEON) || defined(__ARM_NEON__)
5104
5058
  #include <arm_neon.h>
5105
5059
  #define TQ_HAS_NEON 1
@@ -7174,7 +7128,6 @@ void tq_metal_batch_end_if_available(void) {
7174
7128
  * Also supports the legacy llama2.c binary tokenizer format as fallback.
7175
7129
  */
7176
7130
 
7177
-
7178
7131
  /* Global for qsort comparator (vocab index sorting) */
7179
7132
  static char** g_vocab_for_sort;
7180
7133
  static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8519,7 +8472,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
8519
8472
  * Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
8520
8473
  */
8521
8474
 
8522
-
8523
8475
  #ifdef _WIN32
8524
8476
  #else
8525
8477
  #endif
@@ -12179,8 +12131,13 @@ tq_model_t* tq_load_gguf(const char* path) {
12179
12131
  }
12180
12132
 
12181
12133
  const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
12182
- /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
12134
+ /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
12135
+ * Can be set via environment variable or compile-time define (useful for WASM). */
12136
+ #ifdef TQ_NO_Q4
12137
+ if (1) {
12138
+ #else
12183
12139
  if (getenv("TQ_NO_Q4")) {
12140
+ #endif
12184
12141
  fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
12185
12142
  goto skip_q4_conversion;
12186
12143
  }
@@ -12929,7 +12886,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
12929
12886
  * -> residual add
12930
12887
  */
12931
12888
 
12932
-
12933
12889
  /* Unified Q2/1-bit matmul dispatch.
12934
12890
  * When model->use_1bit_weights, Q2 fields contain sign bits + norms,
12935
12891
  * dispatched to tq_matmul_1bit (FP32 input required).
@@ -15189,7 +15145,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
15189
15145
  }
15190
15146
  }
15191
15147
 
15192
-
15193
15148
  /* Increment profile token count if profiling is active */
15194
15149
  if (s->profile_kv) {
15195
15150
  s->profile_kv_count++;
@@ -15240,7 +15195,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
15240
15195
  * - Full generation loop with streaming callback
15241
15196
  */
15242
15197
 
15243
-
15244
15198
  /* ============================================================
15245
15199
  * Argmax sampling: return token with highest logit
15246
15200
  * ============================================================ */
@@ -15461,7 +15415,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15461
15415
  fprintf(stderr, "\n");
15462
15416
  }
15463
15417
 
15464
- /* Prefill: process all prompt tokens */
15418
+ /* Prefill: process all prompt tokens.
15419
+ * NOTE: No emscripten_sleep() here — the call stack during tq_forward()
15420
+ * is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
15421
+ * sleep here breaks ASYNCIFY for the entire generate call, including
15422
+ * the token streaming callback. The browser shows "Thinking..." via
15423
+ * requestAnimationFrame before entering this blocking prefill. */
15465
15424
  for (int i = 0; i < n_prompt; i++) {
15466
15425
  tq_forward(model, state, prompt_tokens[i], i);
15467
15426
  }
@@ -15496,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15496
15455
  }
15497
15456
  }
15498
15457
 
15499
- /* Sample first generated token */
15458
+ /* Sample first generated token. The seed is configurable via
15459
+ * config->rng_seed (default 42); 0 falls back to 42 so existing
15460
+ * callers that never set rng_seed get bit-identical behaviour. */
15500
15461
  int pos = n_prompt;
15501
- unsigned long long rng_state = 42;
15462
+ unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
15502
15463
  int next_token = tq_sample_topp(state->logits, vocab_size,
15503
15464
  config->temperature, config->top_p,
15504
15465
  &rng_state);
@@ -15663,7 +15624,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15663
15624
  return generated;
15664
15625
  }
15665
15626
 
15666
-
15667
15627
  // ============================================================================
15668
15628
 
15669
15629
  // ============================================================================
@@ -0,0 +1,390 @@
1
+ """
2
+ quantcpp CLI — chat with a local LLM in your terminal.
3
+
4
+ Ollama-style commands:
5
+ quantcpp pull MODEL Download a model from HuggingFace
6
+ quantcpp list List cached and available models
7
+ quantcpp run MODEL [Q] Chat with a model (auto-pulls if needed)
8
+ quantcpp serve MODEL Start OpenAI-compatible HTTP server
9
+
10
+ Backwards-compatible shortcut:
11
+ quantcpp Auto-downloads Llama-3.2-1B, starts chat
12
+ quantcpp "What is X?" One-shot question with default model
13
+ quantcpp --model NAME Use a specific model
14
+ """
15
+
16
+ import sys
17
+ import os
18
+ import json
19
+
20
+
21
# Ollama-style short aliases -> canonical _MODEL_REGISTRY keys.
# Each canonical name is listed once together with its bare alias and its
# size-tagged ("name:size") alias; the comprehension flattens that into the
# alias -> canonical mapping, preserving the listed order.
MODEL_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        ("SmolLM2-135M", ("smollm2", "smollm2:135m")),
        ("Qwen3.5-0.8B", ("qwen3.5", "qwen3.5:0.8b")),
        ("Llama-3.2-1B", ("llama3.2", "llama3.2:1b")),
    )
    for alias in aliases
}
30
+
31
+
32
def _resolve_name(name):
    """Map user input onto a canonical registry key or a local path.

    ``None`` is passed through unchanged. A string that names an existing
    filesystem entry and ends in ``.gguf`` is returned as-is. Anything else
    is looked up (lower-cased) in ``MODEL_ALIASES``; names with no alias
    entry are returned verbatim.
    """
    if name is None:
        return None

    is_local_gguf = os.path.exists(name) and name.endswith(".gguf")
    if is_local_gguf:
        return name

    try:
        return MODEL_ALIASES[name.lower()]
    except KeyError:
        return name
39
+
40
+
41
def _registry():
    """Return the ``(_MODEL_REGISTRY, _CACHE_DIR)`` pair from ``quantcpp``.

    The package is imported at call time rather than at module import.
    """
    import quantcpp as _pkg
    return _pkg._MODEL_REGISTRY, _pkg._CACHE_DIR
44
+
45
+
46
def cmd_pull(args):
    """Download a model by alias or canonical name.

    Args:
        args: Parsed CLI namespace; only ``args.model`` is read.

    Returns:
        0 when the model is already a local ``.gguf`` file or the download
        succeeds; 1 when no model was given, the name is not in the
        registry, or the download raises.
    """
    # _resolve_name() passes None straight through and os.path.exists(None)
    # raises TypeError, so reject a missing model before doing anything else.
    if args.model is None:
        print("no model specified", file=sys.stderr)
        return 1

    import quantcpp
    name = _resolve_name(args.model)

    if os.path.exists(name) and name.endswith(".gguf"):
        print(f"already local: {name}")
        return 0

    if name not in quantcpp._MODEL_REGISTRY:
        avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
        aliases = ", ".join(sorted(MODEL_ALIASES.keys()))
        print(f"unknown model: {args.model!r}", file=sys.stderr)
        print(f" registry: {avail}", file=sys.stderr)
        print(f" aliases: {aliases}", file=sys.stderr)
        return 1

    print(f"pulling {name}...", file=sys.stderr)
    try:
        path = quantcpp.download(name)
        size_mb = os.path.getsize(path) / (1024 * 1024)
        print(f"\u2713 {name} \u2192 {path} ({size_mb:.0f} MB)", file=sys.stderr)
        return 0
    except Exception as e:
        # Any failure while downloading is reported as a non-zero exit code
        # instead of a traceback.
        print(f"download failed: {e}", file=sys.stderr)
        return 1
72
+
73
+
74
def cmd_list(args):
    """Print every registry model together with its cache status.

    Reads ``args.json_output``: when truthy the rows are written to stdout
    as a JSON array, otherwise as an aligned text table. Always returns 0.
    """
    registry, cache_dir = _registry()

    rows = []
    for name, entry in sorted(registry.items()):
        _repo, filename, approx_mb = entry
        path = cache_dir / filename

        if path.exists():
            # Cached: report the real on-disk size and location.
            status = "cached"
            size_mb = path.stat().st_size / (1024 * 1024)
            display_path = str(path)
        else:
            # Not downloaded yet: fall back to the registry's size estimate.
            status = "remote"
            size_mb = approx_mb
            display_path = f"~{approx_mb} MB"

        # Show the first alias for this model that carries a ":" tag, if any.
        alias = ""
        for candidate, target in MODEL_ALIASES.items():
            if target == name and ":" in candidate:
                alias = candidate
                break

        rows.append((status, name, alias, size_mb, display_path))

    if args.json_output:
        records = []
        for status, name, alias, size_mb, display_path in rows:
            records.append({
                "status": status,
                "name": name,
                "alias": alias,
                "size_mb": round(size_mb, 1),
                "path": display_path,
            })
        print(json.dumps(records, indent=2))
        return 0

    print(f"\n Models cache: {cache_dir}\n")
    print(f" {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}")
    print(f" {'-'*8} {'-'*16} {'-'*14} {'-'*8}")
    for status, name, alias, size_mb, _ in rows:
        size_str = f"{size_mb:.0f} MB"
        print(f" {status:<8} {name:<16} {alias:<14} {size_str:>8}")
    print()
    return 0
107
+
108
+
109
+ def _resolve_to_path(name_or_path):
110
+ """Resolve alias/name to a local .gguf path, downloading if needed."""
111
+ import quantcpp
112
+ name = _resolve_name(name_or_path)
113
+
114
+ if os.path.exists(name) and name.endswith(".gguf"):
115
+ return name
116
+
117
+ if name not in quantcpp._MODEL_REGISTRY:
118
+ avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
119
+ raise ValueError(
120
+ f"unknown model: {name_or_path!r}. Available: {avail}"
121
+ )
122
+
123
+ repo, filename, _ = quantcpp._MODEL_REGISTRY[name]
124
+ cached = quantcpp._CACHE_DIR / filename
125
+ if cached.exists():
126
+ return str(cached)
127
+
128
+ print(f"model not cached \u2014 pulling {name}...", file=sys.stderr)
129
+ return quantcpp.download(name)
130
+
131
+
132
def cmd_run(args):
    """Chat with a model, pulling it first if it is not available locally.

    Args:
        args: Parsed CLI namespace. Reads ``model``, ``max_tokens``,
            ``temperature``, ``threads`` and ``prompt``. A truthy ``prompt``
            (string or list of words) runs a single question; otherwise an
            interactive loop is started.

    Returns:
        1 if the model cannot be resolved or pulled, 0 otherwise.
    """
    try:
        model_path = _resolve_to_path(args.model)
    except ValueError as e:
        print(str(e), file=sys.stderr)
        return 1
    except Exception as e:
        print(f"pull failed: {e}", file=sys.stderr)
        return 1

    from quantcpp import Model
    print(f"loading {os.path.basename(model_path)}...", file=sys.stderr)
    m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
              n_threads=args.threads)

    # Close the model on every exit path: an exception raised during
    # generation (including Ctrl+C in one-shot mode) previously skipped
    # m.close().
    try:
        if args.prompt:
            question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
            for tok in m.generate(question):
                print(tok, end="", flush=True)
            print()
        else:
            print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
            try:
                while True:
                    question = input("\nYou: ")
                    if not question.strip():
                        continue
                    print("AI: ", end="", flush=True)
                    for tok in m.generate(question):
                        print(tok, end="", flush=True)
                    print()
            except (KeyboardInterrupt, EOFError):
                print("\nBye!", file=sys.stderr)
    finally:
        m.close()

    return 0
169
+
170
+
171
def cmd_serve(args):
    """Start the OpenAI-compatible HTTP server (requires quant-server binary).

    Resolves the model, locates the ``quant-server`` executable (on PATH, or
    in ``./build`` / ``./build_metal``), prints usage hints to stderr and
    then replaces the current process with the server via ``os.execvp``.

    Args:
        args: Parsed CLI namespace. Reads ``model``, ``port`` and ``threads``.

    Returns:
        1 if the model cannot be resolved or the server cannot be executed,
        2 if no ``quant-server`` binary is found. On a successful exec the
        call does not return.
    """
    import shutil

    try:
        model_path = _resolve_to_path(args.model)
    except Exception as e:
        print(f"error: {e}", file=sys.stderr)
        return 1

    binary = shutil.which("quant-server")
    if not binary:
        # Look in common build dirs relative to repo
        for guess in ("./build/quant-server", "./build_metal/quant-server"):
            if os.path.isfile(guess) and os.access(guess, os.X_OK):
                binary = guess
                break

    if not binary:
        print("quant-server binary not found.", file=sys.stderr)
        print(" Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
              file=sys.stderr)
        print(" Or install via your package manager.", file=sys.stderr)
        return 2

    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
    print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
    print("", file=sys.stderr)
    print("OpenAI-compatible endpoints:", file=sys.stderr)
    print(f" POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr)
    print(f" GET http://localhost:{args.port}/v1/models", file=sys.stderr)
    print(f" GET http://localhost:{args.port}/health", file=sys.stderr)
    print("", file=sys.stderr)
    print("Streaming (SSE — token-by-token):", file=sys.stderr)
    print(f" curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
    print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
    print(' -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
          file=sys.stderr)
    print("", file=sys.stderr)
    print("Non-streaming (single JSON response):", file=sys.stderr)
    print(f" curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
    print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
    print(' -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
          file=sys.stderr)
    print("", file=sys.stderr)
    print("OpenAI Python SDK works as-is:", file=sys.stderr)
    print(f" client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
          file=sys.stderr)
    print(" client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
          file=sys.stderr)
    print("", file=sys.stderr)

    # exec* replaces the process without flushing Python's buffered streams,
    # so push out everything printed above before handing over.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvp(cmd[0], cmd)
    except OSError as e:
        # Reached only when the exec itself fails (e.g. the binary vanished
        # or is not executable); report it instead of raising a traceback.
        print(f"failed to start {binary}: {e}", file=sys.stderr)
        return 1
224
+
225
+
226
def cmd_client(args):
    """Send a chat request to a running quantcpp serve endpoint.

    Default mode is streaming (SSE) — tokens print as they arrive.
    Use --no-stream for a single JSON response.

    Args:
        args: Parsed CLI namespace. Reads ``url``, ``model_name``, ``prompt``,
            ``max_tokens``, ``temperature`` and ``no_stream``.

    Returns:
        0 on success; 1 when the server cannot be reached, answers with an
        HTTP error status, or (non-streaming mode) returns a body that is not
        the expected chat-completion JSON.
    """
    import json as _json
    import urllib.error
    import urllib.parse
    import urllib.request

    def _hint_port(base_url):
        """Port to suggest for ``quantcpp serve`` so it matches ``base_url``."""
        try:
            parts = urllib.parse.urlsplit(base_url)
            port = parts.port
        except ValueError:
            # Malformed authority (e.g. non-numeric port): fall back to the
            # default port of the serve sub-command.
            return 8080
        if port is not None:
            return port
        # No explicit port in the URL: the client connects to the scheme's
        # default port, so that is what the server would have to listen on.
        return {"http": 80, "https": 443}.get(parts.scheme, 8080)

    url = args.url.rstrip("/") + "/v1/chat/completions"
    payload = {
        "model": args.model_name,
        "messages": [{"role": "user", "content": args.prompt}],
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "stream": not args.no_stream,
    }
    body = _json.dumps(payload).encode()
    req = urllib.request.Request(
        url, data=body,
        headers={
            "Content-Type": "application/json",
            "User-Agent": "quantcpp-client",
        },
    )

    try:
        with urllib.request.urlopen(req) as resp:
            if args.no_stream:
                try:
                    data = _json.loads(resp.read())
                    content = data["choices"][0]["message"]["content"]
                except (ValueError, KeyError, IndexError, TypeError) as e:
                    print(f"error: unexpected response from server: {e!r}",
                          file=sys.stderr)
                    return 1
                print(content)
                return 0

            # SSE stream — parse `data: {...}\n\n` chunks
            for line in resp:
                line = line.decode("utf-8", errors="replace").rstrip()
                if not line.startswith("data:"):
                    continue
                payload_str = line[5:].strip()
                if payload_str == "[DONE]":
                    break
                try:
                    chunk = _json.loads(payload_str)
                    delta = chunk["choices"][0]["delta"].get("content", "")
                except (ValueError, KeyError, IndexError, TypeError, AttributeError):
                    # Best effort: skip chunks that are not well-formed
                    # chat-completion deltas and keep streaming.
                    continue
                if delta:
                    print(delta, end="", flush=True)
            print()
            return 0
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it must be caught first. The
        # server did answer, so the "is it running?" hint would mislead.
        print(f"request failed: HTTP {e.code} {e.reason}", file=sys.stderr)
        return 1
    except urllib.error.URLError as e:
        print(f"connection failed: {e}", file=sys.stderr)
        print(f" Is the server running on {args.url}?", file=sys.stderr)
        print(f" Start it with: quantcpp serve llama3.2:1b -p {_hint_port(args.url)}",
              file=sys.stderr)
        return 1
282
+
283
+
284
def cmd_chat_default(args):
    """Legacy entry (no sub-command): fill in defaults, then delegate to cmd_run.

    The model falls back to "Llama-3.2-1B", missing tuning attributes get
    their default values, and an empty prompt is normalised to None.
    """
    if not args.model:
        args.model = "Llama-3.2-1B"

    # Keep any value already on the namespace; only fill in what is missing.
    fallbacks = (("threads", 4), ("max_tokens", 256), ("temperature", 0.7))
    for attr, fallback in fallbacks:
        setattr(args, attr, getattr(args, attr, fallback))

    if not args.prompt:
        args.prompt = None

    return cmd_run(args)
292
+
293
+
294
def main(argv=None):
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Two invocation styles are supported:

    * ``quantcpp <command> ...`` with command one of pull/list/run/serve/client.
    * ``quantcpp [options] [prompt ...]`` (no command), handed to
      ``cmd_chat_default``.

    argparse cannot host both a sub-command positional and a free-form
    ``prompt`` positional on the same parser: the sub-command action claims
    the first positional token (so ``quantcpp "What is gravity?"`` is rejected
    as an invalid choice), and the empty top-level ``prompt`` match is stored
    after the sub-parser has run, replacing the ``prompt`` parsed by
    ``run``/``client``. The style is therefore decided up front and only the
    matching positional is registered.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        Whatever the selected handler returns.
    """
    import argparse

    if argv is None:
        argv = sys.argv[1:]
    argv = list(argv)

    commands = ("pull", "list", "run", "serve", "client")
    # Top-level options that consume the following token as their value.
    value_flags = ("--model", "-m", "--max-tokens", "-n",
                   "--temperature", "-t", "--threads", "-j")

    def _first_positional(tokens):
        """Return the first token that is neither an option nor an option value."""
        expect_value = False
        for tok in tokens:
            if expect_value:
                expect_value = False
                continue
            if tok == "--":
                # Everything after "--" is free-form text, never a command.
                return None
            if tok in value_flags:
                expect_value = True
                continue
            if tok.startswith("-") and tok != "-":
                continue  # flag such as -h or --model=NAME
            return tok
        return None

    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
commands:
  pull MODEL            Download a model (e.g. llama3.2:1b)
  list                  List cached and available models
  run MODEL [PROMPT]    Chat with a model (auto-pulls if needed)
  serve MODEL           Start OpenAI-compatible HTTP server
  client PROMPT         Send a request to a running serve (default: SSE streaming)

examples:
  quantcpp pull llama3.2:1b
  quantcpp list
  quantcpp run llama3.2:1b
  quantcpp run llama3.2:1b "What is gravity?"
  quantcpp serve llama3.2:1b --port 8080
  quantcpp client "What is gravity?"                 # streams from :8080
  quantcpp client "Hi" --url http://localhost:8081
  quantcpp client "Hi" --no-stream                   # single JSON response

backwards-compat (no subcommand):
  quantcpp                          # default chat with Llama-3.2-1B
  quantcpp "What is gravity?"       # one-shot
  quantcpp --model SmolLM2-135M     # different model
""",
    )

    # Top-level options are accepted in both styles.
    parser.add_argument("--model", "-m", default=None,
                        help="(default mode) model name or .gguf path")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    parser.add_argument("--threads", "-j", type=int, default=4)

    if _first_positional(argv) in commands:
        sub = parser.add_subparsers(dest="command")

        # pull
        p_pull = sub.add_parser("pull", help="Download a model from HuggingFace")
        p_pull.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")

        # list
        p_list = sub.add_parser("list", help="List cached and available models")
        p_list.add_argument("--json", dest="json_output", action="store_true")

        # run
        p_run = sub.add_parser("run", help="Chat with a model (auto-pulls if needed)")
        p_run.add_argument("model", help="Model name, alias, or .gguf path")
        p_run.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
        p_run.add_argument("-j", "--threads", type=int, default=4)
        p_run.add_argument("-n", "--max-tokens", type=int, default=256)
        p_run.add_argument("-t", "--temperature", type=float, default=0.7)

        # serve
        p_serve = sub.add_parser("serve", help="Start OpenAI-compatible HTTP server")
        p_serve.add_argument("model", help="Model name, alias, or .gguf path")
        p_serve.add_argument("-p", "--port", type=int, default=8080)
        p_serve.add_argument("-j", "--threads", type=int, default=4)

        # client
        p_client = sub.add_parser(
            "client",
            help="Send a chat request to a running quantcpp serve endpoint")
        p_client.add_argument("prompt", help="Question to send")
        p_client.add_argument("--url", default="http://localhost:8080",
                              help="Server URL (default: http://localhost:8080)")
        p_client.add_argument("--model-name", "-m", default="quantcpp",
                              help="Model name in the request body (server ignores)")
        p_client.add_argument("-n", "--max-tokens", type=int, default=256)
        p_client.add_argument("-t", "--temperature", type=float, default=0.7)
        p_client.add_argument("--no-stream", action="store_true",
                              help="Disable SSE streaming (single JSON response)")
    else:
        # Backwards-compat: free-form prompt for direct chat
        parser.add_argument("prompt", nargs="*", default=None,
                            help="(default mode) question to ask")
        parser.set_defaults(command=None)

    args = parser.parse_args(argv)

    if args.command == "pull":
        return cmd_pull(args)
    if args.command == "list":
        return cmd_list(args)
    if args.command == "run":
        return cmd_run(args)
    if args.command == "serve":
        return cmd_serve(args)
    if args.command == "client":
        return cmd_client(args)

    # No subcommand → backwards-compat default chat
    return cmd_chat_default(args)
387
+
388
+
389
+ if __name__ == "__main__":
390
+ sys.exit(main())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: quantcpp
3
- Version: 0.11.0
3
+ Version: 0.12.1
4
4
  Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
5
5
  Author-email: quantumaikr <noreply@quantumaikr.com>
6
6
  License: Apache-2.0
@@ -1,64 +0,0 @@
1
- """
2
- quantcpp CLI — chat with a local LLM in your terminal.
3
-
4
- Usage:
5
- quantcpp # auto-downloads Llama-3.2-1B, starts chat
6
- quantcpp "What is gravity?" # one-shot question
7
- quantcpp --model SmolLM2-135M # use a smaller model (faster download)
8
- quantcpp --model path/to/file.gguf # use your own GGUF file
9
- """
10
-
11
- import sys
12
- import os
13
-
14
-
15
def main():
    """Chat with a local model: one-shot when a prompt is given, else interactive.

    ``--model`` is used as a local file when that path exists; otherwise it is
    passed to ``Model.from_pretrained``. The model is closed on every exit
    path, including when generation raises.
    """
    import argparse
    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
    )
    parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
    parser.add_argument("--model", "-m", default="Llama-3.2-1B",
                        help="Model name or path to .gguf file (default: Llama-3.2-1B)")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    args = parser.parse_args()

    from quantcpp import Model

    # Load model
    model_path = args.model
    if os.path.isfile(model_path):
        print(f"Loading {model_path}...", file=sys.stderr)
        m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
    else:
        print(f"Downloading {model_path}...", file=sys.stderr)
        m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
                                  temperature=args.temperature)

    try:
        # One-shot or interactive
        if args.prompt:
            question = " ".join(args.prompt)
            for tok in m.generate(question):
                print(tok, end="", flush=True)
            print()
        else:
            print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
            try:
                while True:
                    question = input("\nYou: ")
                    if not question.strip():
                        continue
                    print("AI: ", end="", flush=True)
                    for tok in m.generate(question):
                        print(tok, end="", flush=True)
                    print()
            except (KeyboardInterrupt, EOFError):
                print("\nBye!", file=sys.stderr)
    finally:
        # Release the model even if generation fails part-way.
        m.close()
61
-
62
-
63
- if __name__ == "__main__":
64
- main()
File without changes
File without changes
File without changes
File without changes
File without changes