quantcpp 0.11.0__tar.gz → 0.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: quantcpp
3
- Version: 0.11.0
3
+ Version: 0.12.0
4
4
  Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
5
5
  Author-email: quantumaikr <noreply@quantumaikr.com>
6
6
  License: Apache-2.0
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "quantcpp"
10
- version = "0.11.0"
10
+ version = "0.12.0"
11
11
  description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
12
12
  readme = "README.md"
13
13
  license = { text = "Apache-2.0" }
@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
202
202
  // Section 1: Types and Specs (from tq_types.h, tq_spec.h)
203
203
  // ============================================================================
204
204
 
205
-
206
-
207
205
  /* Cross-language static assert: works in both C11 and C++11/17 */
208
206
  #ifdef __cplusplus
209
207
  #define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
219
217
  #define TQ_PI_2 1.5707963267948966f
220
218
  #endif
221
219
 
222
-
223
-
224
220
  /* ============================================================
225
221
  * Constants
226
222
  * ============================================================ */
@@ -398,8 +394,6 @@ typedef struct {
398
394
  int enable_recompression;/* Tier 1 → Tier 2 re-compression */
399
395
  } tq_progressive_config_t;
400
396
 
401
-
402
-
403
397
  /* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
404
398
  * 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
405
399
  * Block covers TQ_BK elements (128).
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
469
463
  TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
470
464
  TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
471
465
 
472
-
473
-
474
-
475
-
476
-
477
-
478
466
  /* Format specification — version-aware, ONNX-inspired */
479
467
 
480
468
  #define TQ_SPEC_VERSION 1
@@ -500,18 +488,10 @@ typedef struct {
500
488
  uint8_t flags; /* TQ_FLAG_* bitmask */
501
489
  } tq_format_spec_t;
502
490
 
503
-
504
-
505
-
506
-
507
491
  // ============================================================================
508
492
  // Section 2: Engine Types (from tq_engine.h)
509
493
  // ============================================================================
510
494
 
511
-
512
-
513
-
514
-
515
495
  /* ============================================================
516
496
  * Model configuration
517
497
  * ============================================================ */
@@ -1123,9 +1103,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
1123
1103
  /* Max threads supported by thread pool */
1124
1104
  #define TQ_TP_MAX 16
1125
1105
 
1126
-
1127
-
1128
-
1129
1106
  // ============================================================================
1130
1107
  // Section 3: GGUF Types (from tq_gguf.h)
1131
1108
  // ============================================================================
@@ -1143,10 +1120,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
1143
1120
  * directly into TurboQuant inference engine.
1144
1121
  */
1145
1122
 
1146
-
1147
-
1148
-
1149
-
1150
1123
  /* ============================================================
1151
1124
  * GGUF format constants
1152
1125
  * ============================================================ */
@@ -1462,14 +1435,10 @@ int tq_metal_moe_forward(
1462
1435
  const int* up_types, /* per-expert up quant types, NULL = use weight_type */
1463
1436
  const int* down_types); /* per-expert down quant types, NULL = use weight_type */
1464
1437
 
1465
-
1466
-
1467
-
1468
1438
  // ============================================================================
1469
1439
  // Section 4: Internal API (from turboquant.h)
1470
1440
  // ============================================================================
1471
1441
 
1472
-
1473
1442
  /**
1474
1443
  * TurboQuant.cpp — Cross-platform KV cache compression library
1475
1444
  *
@@ -1477,9 +1446,6 @@ int tq_metal_moe_forward(
1477
1446
  * Zero external dependencies (libc/libm only).
1478
1447
  */
1479
1448
 
1480
-
1481
-
1482
-
1483
1449
  /* ============================================================
1484
1450
  * Version
1485
1451
  * ============================================================ */
@@ -1753,15 +1719,10 @@ void tq_progressive_free(tq_progressive_t* p);
1753
1719
 
1754
1720
  tq_progressive_config_t tq_progressive_default_config(void);
1755
1721
 
1756
-
1757
-
1758
-
1759
-
1760
1722
  // ============================================================================
1761
1723
  // Section 5: quant_ctx struct definition
1762
1724
  // ============================================================================
1763
1725
 
1764
-
1765
1726
  struct quant_ctx {
1766
1727
  tq_model_t* model;
1767
1728
  tq_state_t* state;
@@ -1788,7 +1749,6 @@ struct quant_ctx {
1788
1749
  * - Random signs decorrelate channels across different blocks
1789
1750
  */
1790
1751
 
1791
-
1792
1752
  #ifdef __ARM_NEON
1793
1753
  #include <arm_neon.h>
1794
1754
  #endif
@@ -1902,7 +1862,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
1902
1862
  */
1903
1863
  /* Generic reference — no compiler-specific pragmas */
1904
1864
 
1905
-
1906
1865
  /* ---------- FP16 helpers ---------- */
1907
1866
 
1908
1867
  static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2244,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
2285
2244
  // Section 8: Type Traits (from tq_traits.c)
2286
2245
  // ============================================================================
2287
2246
 
2288
-
2289
2247
  /* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
2290
2248
  static void tq_stub_quantize(const float* src, void* dst, int n) {
2291
2249
  (void)src; (void)dst; (void)n;
@@ -2583,7 +2541,6 @@ tq_type tq_type_from_name(const char* name) {
2583
2541
  * No external dependencies — libc/libm only.
2584
2542
  */
2585
2543
 
2586
-
2587
2544
  #ifdef __ARM_NEON
2588
2545
  #include <arm_neon.h>
2589
2546
  #endif
@@ -2617,7 +2574,6 @@ static struct {
2617
2574
 
2618
2575
  static int g_n_threads = 1;
2619
2576
 
2620
-
2621
2577
  static void* tp_worker(void* arg) {
2622
2578
  int id = (int)(intptr_t)arg;
2623
2579
  int my_gen = 0;
@@ -4388,8 +4344,6 @@ void tq_matmul_1bit(float* out, const float* x,
4388
4344
  * SPDX-License-Identifier: MIT
4389
4345
  */
4390
4346
 
4391
-
4392
-
4393
4347
  #ifdef _WIN32
4394
4348
  #else
4395
4349
  #endif
@@ -5098,8 +5052,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
5098
5052
  * Pure C11, no external dependencies.
5099
5053
  */
5100
5054
 
5101
-
5102
-
5103
5055
  #if defined(__ARM_NEON) || defined(__ARM_NEON__)
5104
5056
  #include <arm_neon.h>
5105
5057
  #define TQ_HAS_NEON 1
@@ -7174,7 +7126,6 @@ void tq_metal_batch_end_if_available(void) {
7174
7126
  * Also supports the legacy llama2.c binary tokenizer format as fallback.
7175
7127
  */
7176
7128
 
7177
-
7178
7129
  /* Global for qsort comparator (vocab index sorting) */
7179
7130
  static char** g_vocab_for_sort;
7180
7131
  static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8519,7 +8470,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
8519
8470
  * Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
8520
8471
  */
8521
8472
 
8522
-
8523
8473
  #ifdef _WIN32
8524
8474
  #else
8525
8475
  #endif
@@ -12179,8 +12129,13 @@ tq_model_t* tq_load_gguf(const char* path) {
12179
12129
  }
12180
12130
 
12181
12131
  const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
12182
- /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
12132
+ /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
12133
+ * Can be set via environment variable or compile-time define (useful for WASM). */
12134
+ #ifdef TQ_NO_Q4
12135
+ if (1) {
12136
+ #else
12183
12137
  if (getenv("TQ_NO_Q4")) {
12138
+ #endif
12184
12139
  fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
12185
12140
  goto skip_q4_conversion;
12186
12141
  }
@@ -12929,7 +12884,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
12929
12884
  * -> residual add
12930
12885
  */
12931
12886
 
12932
-
12933
12887
  /* Unified Q2/1-bit matmul dispatch.
12934
12888
  * When model->use_1bit_weights, Q2 fields contain sign bits + norms,
12935
12889
  * dispatched to tq_matmul_1bit (FP32 input required).
@@ -15189,7 +15143,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
15189
15143
  }
15190
15144
  }
15191
15145
 
15192
-
15193
15146
  /* Increment profile token count if profiling is active */
15194
15147
  if (s->profile_kv) {
15195
15148
  s->profile_kv_count++;
@@ -15240,7 +15193,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
15240
15193
  * - Full generation loop with streaming callback
15241
15194
  */
15242
15195
 
15243
-
15244
15196
  /* ============================================================
15245
15197
  * Argmax sampling: return token with highest logit
15246
15198
  * ============================================================ */
@@ -15461,7 +15413,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15461
15413
  fprintf(stderr, "\n");
15462
15414
  }
15463
15415
 
15464
- /* Prefill: process all prompt tokens */
15416
+ /* Prefill: process all prompt tokens.
15417
+ * NOTE: No emscripten_sleep() here — the call stack during tq_forward()
15418
+ * is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
15419
+ * sleep here breaks ASYNCIFY for the entire generate call, including
15420
+ * the token streaming callback. The browser shows "Thinking..." via
15421
+ * requestAnimationFrame before entering this blocking prefill. */
15465
15422
  for (int i = 0; i < n_prompt; i++) {
15466
15423
  tq_forward(model, state, prompt_tokens[i], i);
15467
15424
  }
@@ -15663,7 +15620,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15663
15620
  return generated;
15664
15621
  }
15665
15622
 
15666
-
15667
15623
  // ============================================================================
15668
15624
 
15669
15625
  // ============================================================================
@@ -15,7 +15,7 @@ try:
15
15
  from importlib.metadata import version as _pkg_version
16
16
  __version__ = _pkg_version("quantcpp")
17
17
  except Exception:
18
- __version__ = "0.11.0" # fallback for editable / source-tree imports
18
+ __version__ = "0.12.0" # fallback for editable / source-tree imports
19
19
 
20
20
  import os
21
21
  import sys
@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
202
202
  // Section 1: Types and Specs (from tq_types.h, tq_spec.h)
203
203
  // ============================================================================
204
204
 
205
-
206
-
207
205
  /* Cross-language static assert: works in both C11 and C++11/17 */
208
206
  #ifdef __cplusplus
209
207
  #define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
219
217
  #define TQ_PI_2 1.5707963267948966f
220
218
  #endif
221
219
 
222
-
223
-
224
220
  /* ============================================================
225
221
  * Constants
226
222
  * ============================================================ */
@@ -398,8 +394,6 @@ typedef struct {
398
394
  int enable_recompression;/* Tier 1 → Tier 2 re-compression */
399
395
  } tq_progressive_config_t;
400
396
 
401
-
402
-
403
397
  /* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
404
398
  * 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
405
399
  * Block covers TQ_BK elements (128).
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
469
463
  TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
470
464
  TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
471
465
 
472
-
473
-
474
-
475
-
476
-
477
-
478
466
  /* Format specification — version-aware, ONNX-inspired */
479
467
 
480
468
  #define TQ_SPEC_VERSION 1
@@ -500,18 +488,10 @@ typedef struct {
500
488
  uint8_t flags; /* TQ_FLAG_* bitmask */
501
489
  } tq_format_spec_t;
502
490
 
503
-
504
-
505
-
506
-
507
491
  // ============================================================================
508
492
  // Section 2: Engine Types (from tq_engine.h)
509
493
  // ============================================================================
510
494
 
511
-
512
-
513
-
514
-
515
495
  /* ============================================================
516
496
  * Model configuration
517
497
  * ============================================================ */
@@ -1123,9 +1103,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
1123
1103
  /* Max threads supported by thread pool */
1124
1104
  #define TQ_TP_MAX 16
1125
1105
 
1126
-
1127
-
1128
-
1129
1106
  // ============================================================================
1130
1107
  // Section 3: GGUF Types (from tq_gguf.h)
1131
1108
  // ============================================================================
@@ -1143,10 +1120,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
1143
1120
  * directly into TurboQuant inference engine.
1144
1121
  */
1145
1122
 
1146
-
1147
-
1148
-
1149
-
1150
1123
  /* ============================================================
1151
1124
  * GGUF format constants
1152
1125
  * ============================================================ */
@@ -1462,14 +1435,10 @@ int tq_metal_moe_forward(
1462
1435
  const int* up_types, /* per-expert up quant types, NULL = use weight_type */
1463
1436
  const int* down_types); /* per-expert down quant types, NULL = use weight_type */
1464
1437
 
1465
-
1466
-
1467
-
1468
1438
  // ============================================================================
1469
1439
  // Section 4: Internal API (from turboquant.h)
1470
1440
  // ============================================================================
1471
1441
 
1472
-
1473
1442
  /**
1474
1443
  * TurboQuant.cpp — Cross-platform KV cache compression library
1475
1444
  *
@@ -1477,9 +1446,6 @@ int tq_metal_moe_forward(
1477
1446
  * Zero external dependencies (libc/libm only).
1478
1447
  */
1479
1448
 
1480
-
1481
-
1482
-
1483
1449
  /* ============================================================
1484
1450
  * Version
1485
1451
  * ============================================================ */
@@ -1753,15 +1719,10 @@ void tq_progressive_free(tq_progressive_t* p);
1753
1719
 
1754
1720
  tq_progressive_config_t tq_progressive_default_config(void);
1755
1721
 
1756
-
1757
-
1758
-
1759
-
1760
1722
  // ============================================================================
1761
1723
  // Section 5: quant_ctx struct definition
1762
1724
  // ============================================================================
1763
1725
 
1764
-
1765
1726
  struct quant_ctx {
1766
1727
  tq_model_t* model;
1767
1728
  tq_state_t* state;
@@ -1788,7 +1749,6 @@ struct quant_ctx {
1788
1749
  * - Random signs decorrelate channels across different blocks
1789
1750
  */
1790
1751
 
1791
-
1792
1752
  #ifdef __ARM_NEON
1793
1753
  #include <arm_neon.h>
1794
1754
  #endif
@@ -1902,7 +1862,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
1902
1862
  */
1903
1863
  /* Generic reference — no compiler-specific pragmas */
1904
1864
 
1905
-
1906
1865
  /* ---------- FP16 helpers ---------- */
1907
1866
 
1908
1867
  static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2244,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
2285
2244
  // Section 8: Type Traits (from tq_traits.c)
2286
2245
  // ============================================================================
2287
2246
 
2288
-
2289
2247
  /* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
2290
2248
  static void tq_stub_quantize(const float* src, void* dst, int n) {
2291
2249
  (void)src; (void)dst; (void)n;
@@ -2583,7 +2541,6 @@ tq_type tq_type_from_name(const char* name) {
2583
2541
  * No external dependencies — libc/libm only.
2584
2542
  */
2585
2543
 
2586
-
2587
2544
  #ifdef __ARM_NEON
2588
2545
  #include <arm_neon.h>
2589
2546
  #endif
@@ -2617,7 +2574,6 @@ static struct {
2617
2574
 
2618
2575
  static int g_n_threads = 1;
2619
2576
 
2620
-
2621
2577
  static void* tp_worker(void* arg) {
2622
2578
  int id = (int)(intptr_t)arg;
2623
2579
  int my_gen = 0;
@@ -4388,8 +4344,6 @@ void tq_matmul_1bit(float* out, const float* x,
4388
4344
  * SPDX-License-Identifier: MIT
4389
4345
  */
4390
4346
 
4391
-
4392
-
4393
4347
  #ifdef _WIN32
4394
4348
  #else
4395
4349
  #endif
@@ -5098,8 +5052,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
5098
5052
  * Pure C11, no external dependencies.
5099
5053
  */
5100
5054
 
5101
-
5102
-
5103
5055
  #if defined(__ARM_NEON) || defined(__ARM_NEON__)
5104
5056
  #include <arm_neon.h>
5105
5057
  #define TQ_HAS_NEON 1
@@ -7174,7 +7126,6 @@ void tq_metal_batch_end_if_available(void) {
7174
7126
  * Also supports the legacy llama2.c binary tokenizer format as fallback.
7175
7127
  */
7176
7128
 
7177
-
7178
7129
  /* Global for qsort comparator (vocab index sorting) */
7179
7130
  static char** g_vocab_for_sort;
7180
7131
  static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8519,7 +8470,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
8519
8470
  * Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
8520
8471
  */
8521
8472
 
8522
-
8523
8473
  #ifdef _WIN32
8524
8474
  #else
8525
8475
  #endif
@@ -12179,8 +12129,13 @@ tq_model_t* tq_load_gguf(const char* path) {
12179
12129
  }
12180
12130
 
12181
12131
  const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
12182
- /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
12132
+ /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
12133
+ * Can be set via environment variable or compile-time define (useful for WASM). */
12134
+ #ifdef TQ_NO_Q4
12135
+ if (1) {
12136
+ #else
12183
12137
  if (getenv("TQ_NO_Q4")) {
12138
+ #endif
12184
12139
  fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
12185
12140
  goto skip_q4_conversion;
12186
12141
  }
@@ -12929,7 +12884,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
12929
12884
  * -> residual add
12930
12885
  */
12931
12886
 
12932
-
12933
12887
  /* Unified Q2/1-bit matmul dispatch.
12934
12888
  * When model->use_1bit_weights, Q2 fields contain sign bits + norms,
12935
12889
  * dispatched to tq_matmul_1bit (FP32 input required).
@@ -15189,7 +15143,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
15189
15143
  }
15190
15144
  }
15191
15145
 
15192
-
15193
15146
  /* Increment profile token count if profiling is active */
15194
15147
  if (s->profile_kv) {
15195
15148
  s->profile_kv_count++;
@@ -15240,7 +15193,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
15240
15193
  * - Full generation loop with streaming callback
15241
15194
  */
15242
15195
 
15243
-
15244
15196
  /* ============================================================
15245
15197
  * Argmax sampling: return token with highest logit
15246
15198
  * ============================================================ */
@@ -15461,7 +15413,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15461
15413
  fprintf(stderr, "\n");
15462
15414
  }
15463
15415
 
15464
- /* Prefill: process all prompt tokens */
15416
+ /* Prefill: process all prompt tokens.
15417
+ * NOTE: No emscripten_sleep() here — the call stack during tq_forward()
15418
+ * is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
15419
+ * sleep here breaks ASYNCIFY for the entire generate call, including
15420
+ * the token streaming callback. The browser shows "Thinking..." via
15421
+ * requestAnimationFrame before entering this blocking prefill. */
15465
15422
  for (int i = 0; i < n_prompt; i++) {
15466
15423
  tq_forward(model, state, prompt_tokens[i], i);
15467
15424
  }
@@ -15663,7 +15620,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15663
15620
  return generated;
15664
15621
  }
15665
15622
 
15666
-
15667
15623
  // ============================================================================
15668
15624
 
15669
15625
  // ============================================================================
@@ -0,0 +1,289 @@
1
+ """
2
+ quantcpp CLI — chat with a local LLM in your terminal.
3
+
4
+ Ollama-style commands:
5
+ quantcpp pull MODEL Download a model from HuggingFace
6
+ quantcpp list List cached and available models
7
+ quantcpp run MODEL [Q] Chat with a model (auto-pulls if needed)
8
+ quantcpp serve MODEL Start OpenAI-compatible HTTP server
9
+
10
+ Backwards-compatible shortcut:
11
+ quantcpp Auto-downloads Llama-3.2-1B, starts chat
12
+ quantcpp "What is X?" One-shot question with default model
13
+ quantcpp --model NAME Use a specific model
14
+ """
15
+
16
+ import sys
17
+ import os
18
+ import json
19
+
20
+
21
+ # Ollama-style short aliases → canonical _MODEL_REGISTRY keys
22
+ MODEL_ALIASES = {
23
+ "smollm2": "SmolLM2-135M",
24
+ "smollm2:135m": "SmolLM2-135M",
25
+ "qwen3.5": "Qwen3.5-0.8B",
26
+ "qwen3.5:0.8b": "Qwen3.5-0.8B",
27
+ "llama3.2": "Llama-3.2-1B",
28
+ "llama3.2:1b": "Llama-3.2-1B",
29
+ }
30
+
31
+
32
+ def _resolve_name(name):
33
+ """Resolve user input to canonical registry key or local path."""
34
+ if name is None:
35
+ return None
36
+ if os.path.exists(name) and name.endswith(".gguf"):
37
+ return name
38
+ return MODEL_ALIASES.get(name.lower(), name)
39
+
40
+
41
+ def _registry():
42
+ from quantcpp import _MODEL_REGISTRY, _CACHE_DIR
43
+ return _MODEL_REGISTRY, _CACHE_DIR
44
+
45
+
46
+ def cmd_pull(args):
47
+ """Download a model by alias or canonical name."""
48
+ import quantcpp
49
+ name = _resolve_name(args.model)
50
+
51
+ if os.path.exists(name) and name.endswith(".gguf"):
52
+ print(f"already local: {name}")
53
+ return 0
54
+
55
+ if name not in quantcpp._MODEL_REGISTRY:
56
+ avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
57
+ aliases = ", ".join(sorted(MODEL_ALIASES.keys()))
58
+ print(f"unknown model: {args.model!r}", file=sys.stderr)
59
+ print(f" registry: {avail}", file=sys.stderr)
60
+ print(f" aliases: {aliases}", file=sys.stderr)
61
+ return 1
62
+
63
+ print(f"pulling {name}...", file=sys.stderr)
64
+ try:
65
+ path = quantcpp.download(name)
66
+ size_mb = os.path.getsize(path) / (1024 * 1024)
67
+ print(f"\u2713 {name} \u2192 {path} ({size_mb:.0f} MB)", file=sys.stderr)
68
+ return 0
69
+ except Exception as e:
70
+ print(f"download failed: {e}", file=sys.stderr)
71
+ return 1
72
+
73
+
74
+ def cmd_list(args):
75
+ """List cached and available models."""
76
+ registry, cache_dir = _registry()
77
+
78
+ rows = []
79
+ for name, (repo, filename, approx_mb) in sorted(registry.items()):
80
+ path = cache_dir / filename
81
+ if path.exists():
82
+ size_mb = path.stat().st_size / (1024 * 1024)
83
+ status = "cached"
84
+ display_path = str(path)
85
+ else:
86
+ size_mb = approx_mb
87
+ status = "remote"
88
+ display_path = f"~{approx_mb} MB"
89
+ alias = next((a for a, n in MODEL_ALIASES.items() if n == name and ":" in a), "")
90
+ rows.append((status, name, alias, size_mb, display_path))
91
+
92
+ if args.json_output:
93
+ print(json.dumps([
94
+ {"status": s, "name": n, "alias": a, "size_mb": round(sz, 1), "path": p}
95
+ for (s, n, a, sz, p) in rows
96
+ ], indent=2))
97
+ return 0
98
+
99
+ print(f"\n Models cache: {cache_dir}\n")
100
+ print(f" {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}")
101
+ print(f" {'-'*8} {'-'*16} {'-'*14} {'-'*8}")
102
+ for status, name, alias, size_mb, _ in rows:
103
+ size_str = f"{size_mb:.0f} MB"
104
+ print(f" {status:<8} {name:<16} {alias:<14} {size_str:>8}")
105
+ print()
106
+ return 0
107
+
108
+
109
+ def _resolve_to_path(name_or_path):
110
+ """Resolve alias/name to a local .gguf path, downloading if needed."""
111
+ import quantcpp
112
+ name = _resolve_name(name_or_path)
113
+
114
+ if os.path.exists(name) and name.endswith(".gguf"):
115
+ return name
116
+
117
+ if name not in quantcpp._MODEL_REGISTRY:
118
+ avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
119
+ raise ValueError(
120
+ f"unknown model: {name_or_path!r}. Available: {avail}"
121
+ )
122
+
123
+ repo, filename, _ = quantcpp._MODEL_REGISTRY[name]
124
+ cached = quantcpp._CACHE_DIR / filename
125
+ if cached.exists():
126
+ return str(cached)
127
+
128
+ print(f"model not cached \u2014 pulling {name}...", file=sys.stderr)
129
+ return quantcpp.download(name)
130
+
131
+
132
+ def cmd_run(args):
133
+ """Chat with a model (auto-pull if needed)."""
134
+ try:
135
+ model_path = _resolve_to_path(args.model)
136
+ except ValueError as e:
137
+ print(str(e), file=sys.stderr)
138
+ return 1
139
+ except Exception as e:
140
+ print(f"pull failed: {e}", file=sys.stderr)
141
+ return 1
142
+
143
+ from quantcpp import Model
144
+ print(f"loading {os.path.basename(model_path)}...", file=sys.stderr)
145
+ m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
146
+ n_threads=args.threads)
147
+
148
+ if args.prompt:
149
+ question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
150
+ for tok in m.generate(question):
151
+ print(tok, end="", flush=True)
152
+ print()
153
+ else:
154
+ print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
155
+ try:
156
+ while True:
157
+ question = input("\nYou: ")
158
+ if not question.strip():
159
+ continue
160
+ print("AI: ", end="", flush=True)
161
+ for tok in m.generate(question):
162
+ print(tok, end="", flush=True)
163
+ print()
164
+ except (KeyboardInterrupt, EOFError):
165
+ print("\nBye!", file=sys.stderr)
166
+
167
+ m.close()
168
+ return 0
169
+
170
+
171
+ def cmd_serve(args):
172
+ """Start OpenAI-compatible HTTP server (requires quant-server binary)."""
173
+ import shutil
174
+ import subprocess
175
+
176
+ try:
177
+ model_path = _resolve_to_path(args.model)
178
+ except Exception as e:
179
+ print(f"error: {e}", file=sys.stderr)
180
+ return 1
181
+
182
+ binary = shutil.which("quant-server")
183
+ if not binary:
184
+ # Look in common build dirs relative to repo
185
+ for guess in ("./build/quant-server", "./build_metal/quant-server"):
186
+ if os.path.isfile(guess) and os.access(guess, os.X_OK):
187
+ binary = guess
188
+ break
189
+
190
+ if not binary:
191
+ print("quant-server binary not found.", file=sys.stderr)
192
+ print(" Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
193
+ file=sys.stderr)
194
+ print(" Or install via your package manager.", file=sys.stderr)
195
+ return 2
196
+
197
+ cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
198
+ print(f"quant serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
199
+ os.execvp(cmd[0], cmd)
200
+
201
+
202
+ def cmd_chat_default(args):
203
+ """Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
204
+ args.model = args.model or "Llama-3.2-1B"
205
+ args.threads = getattr(args, "threads", 4)
206
+ args.max_tokens = getattr(args, "max_tokens", 256)
207
+ args.temperature = getattr(args, "temperature", 0.7)
208
+ args.prompt = args.prompt or None
209
+ return cmd_run(args)
210
+
211
+
212
+ def main():
213
+ import argparse
214
+
215
+ parser = argparse.ArgumentParser(
216
+ prog="quantcpp",
217
+ description="Chat with a local LLM. No API key, no GPU, no server.",
218
+ formatter_class=argparse.RawDescriptionHelpFormatter,
219
+ epilog="""
220
+ commands:
221
+ pull MODEL Download a model (e.g. llama3.2:1b)
222
+ list List cached and available models
223
+ run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
224
+ serve MODEL Start OpenAI-compatible HTTP server
225
+
226
+ examples:
227
+ quantcpp pull llama3.2:1b
228
+ quantcpp list
229
+ quantcpp run llama3.2:1b
230
+ quantcpp run llama3.2:1b "What is gravity?"
231
+ quantcpp serve llama3.2:1b --port 8080
232
+
233
+ backwards-compat (no subcommand):
234
+ quantcpp # default chat with Llama-3.2-1B
235
+ quantcpp "What is gravity?" # one-shot
236
+ quantcpp --model SmolLM2-135M # different model
237
+ """,
238
+ )
239
+
240
+ sub = parser.add_subparsers(dest="command")
241
+
242
+ # pull
243
+ p_pull = sub.add_parser("pull", help="Download a model from HuggingFace")
244
+ p_pull.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")
245
+
246
+ # list
247
+ p_list = sub.add_parser("list", help="List cached and available models")
248
+ p_list.add_argument("--json", dest="json_output", action="store_true")
249
+
250
+ # run
251
+ p_run = sub.add_parser("run", help="Chat with a model (auto-pulls if needed)")
252
+ p_run.add_argument("model", help="Model name, alias, or .gguf path")
253
+ p_run.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
254
+ p_run.add_argument("-j", "--threads", type=int, default=4)
255
+ p_run.add_argument("-n", "--max-tokens", type=int, default=256)
256
+ p_run.add_argument("-t", "--temperature", type=float, default=0.7)
257
+
258
+ # serve
259
+ p_serve = sub.add_parser("serve", help="Start OpenAI-compatible HTTP server")
260
+ p_serve.add_argument("model", help="Model name, alias, or .gguf path")
261
+ p_serve.add_argument("-p", "--port", type=int, default=8080)
262
+ p_serve.add_argument("-j", "--threads", type=int, default=4)
263
+
264
+ # Backwards-compat: top-level args for direct chat
265
+ parser.add_argument("prompt", nargs="*", default=None,
266
+ help="(default mode) question to ask")
267
+ parser.add_argument("--model", "-m", default=None,
268
+ help="(default mode) model name or .gguf path")
269
+ parser.add_argument("--max-tokens", "-n", type=int, default=256)
270
+ parser.add_argument("--temperature", "-t", type=float, default=0.7)
271
+ parser.add_argument("--threads", "-j", type=int, default=4)
272
+
273
+ args = parser.parse_args()
274
+
275
+ if args.command == "pull":
276
+ return cmd_pull(args)
277
+ if args.command == "list":
278
+ return cmd_list(args)
279
+ if args.command == "run":
280
+ return cmd_run(args)
281
+ if args.command == "serve":
282
+ return cmd_serve(args)
283
+
284
+ # No subcommand → backwards-compat default chat
285
+ return cmd_chat_default(args)
286
+
287
+
288
+ if __name__ == "__main__":
289
+ sys.exit(main())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: quantcpp
3
- Version: 0.11.0
3
+ Version: 0.12.0
4
4
  Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
5
5
  Author-email: quantumaikr <noreply@quantumaikr.com>
6
6
  License: Apache-2.0
@@ -1,64 +0,0 @@
1
- """
2
- quantcpp CLI — chat with a local LLM in your terminal.
3
-
4
- Usage:
5
- quantcpp # auto-downloads Llama-3.2-1B, starts chat
6
- quantcpp "What is gravity?" # one-shot question
7
- quantcpp --model SmolLM2-135M # use a smaller model (faster download)
8
- quantcpp --model path/to/file.gguf # use your own GGUF file
9
- """
10
-
11
- import sys
12
- import os
13
-
14
-
15
- def main():
16
- import argparse
17
- parser = argparse.ArgumentParser(
18
- prog="quantcpp",
19
- description="Chat with a local LLM. No API key, no GPU, no server.",
20
- )
21
- parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
22
- parser.add_argument("--model", "-m", default="Llama-3.2-1B",
23
- help="Model name or path to .gguf file (default: Llama-3.2-1B)")
24
- parser.add_argument("--max-tokens", "-n", type=int, default=256)
25
- parser.add_argument("--temperature", "-t", type=float, default=0.7)
26
- args = parser.parse_args()
27
-
28
- from quantcpp import Model
29
-
30
- # Load model
31
- model_path = args.model
32
- if os.path.isfile(model_path):
33
- print(f"Loading {model_path}...", file=sys.stderr)
34
- m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
35
- else:
36
- print(f"Downloading {model_path}...", file=sys.stderr)
37
- m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
38
- temperature=args.temperature)
39
-
40
- # One-shot or interactive
41
- if args.prompt:
42
- question = " ".join(args.prompt)
43
- for tok in m.generate(question):
44
- print(tok, end="", flush=True)
45
- print()
46
- else:
47
- print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
48
- try:
49
- while True:
50
- question = input("\nYou: ")
51
- if not question.strip():
52
- continue
53
- print("AI: ", end="", flush=True)
54
- for tok in m.generate(question):
55
- print(tok, end="", flush=True)
56
- print()
57
- except (KeyboardInterrupt, EOFError):
58
- print("\nBye!", file=sys.stderr)
59
-
60
- m.close()
61
-
62
-
63
- if __name__ == "__main__":
64
- main()
File without changes
File without changes
File without changes
File without changes
File without changes