quantcpp 0.11.0__tar.gz → 0.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quantcpp-0.11.0/quantcpp.egg-info → quantcpp-0.12.1}/PKG-INFO +1 -1
- {quantcpp-0.11.0 → quantcpp-0.12.1}/pyproject.toml +1 -1
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quant.h +18 -58
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp/__init__.py +1 -1
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp/_quant.h +18 -58
- quantcpp-0.12.1/quantcpp/cli.py +390 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1/quantcpp.egg-info}/PKG-INFO +1 -1
- quantcpp-0.11.0/quantcpp/cli.py +0 -64
- {quantcpp-0.11.0 → quantcpp-0.12.1}/MANIFEST.in +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/README.md +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp/_binding.py +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp.egg-info/SOURCES.txt +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp.egg-info/dependency_links.txt +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp.egg-info/entry_points.txt +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp.egg-info/requires.txt +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp.egg-info/top_level.txt +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/setup.cfg +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/setup.py +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/tests/test_basic.py +0 -0
- {quantcpp-0.11.0 → quantcpp-0.12.1}/tests/test_python.py +0 -0
|
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "quantcpp"
|
|
10
|
-
version = "0.
|
|
10
|
+
version = "0.12.1"
|
|
11
11
|
description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
|
|
12
12
|
readme = "README.md"
|
|
13
13
|
license = { text = "Apache-2.0" }
|
|
@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
|
|
|
202
202
|
// Section 1: Types and Specs (from tq_types.h, tq_spec.h)
|
|
203
203
|
// ============================================================================
|
|
204
204
|
|
|
205
|
-
|
|
206
|
-
|
|
207
205
|
/* Cross-language static assert: works in both C11 and C++11/17 */
|
|
208
206
|
#ifdef __cplusplus
|
|
209
207
|
#define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
|
|
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
|
|
|
219
217
|
#define TQ_PI_2 1.5707963267948966f
|
|
220
218
|
#endif
|
|
221
219
|
|
|
222
|
-
|
|
223
|
-
|
|
224
220
|
/* ============================================================
|
|
225
221
|
* Constants
|
|
226
222
|
* ============================================================ */
|
|
@@ -398,8 +394,6 @@ typedef struct {
|
|
|
398
394
|
int enable_recompression;/* Tier 1 → Tier 2 re-compression */
|
|
399
395
|
} tq_progressive_config_t;
|
|
400
396
|
|
|
401
|
-
|
|
402
|
-
|
|
403
397
|
/* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
|
|
404
398
|
* 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
|
|
405
399
|
* Block covers TQ_BK elements (128).
|
|
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
|
|
|
469
463
|
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
|
|
470
464
|
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
|
|
471
465
|
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
466
|
/* Format specification — version-aware, ONNX-inspired */
|
|
479
467
|
|
|
480
468
|
#define TQ_SPEC_VERSION 1
|
|
@@ -500,18 +488,10 @@ typedef struct {
|
|
|
500
488
|
uint8_t flags; /* TQ_FLAG_* bitmask */
|
|
501
489
|
} tq_format_spec_t;
|
|
502
490
|
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
491
|
// ============================================================================
|
|
508
492
|
// Section 2: Engine Types (from tq_engine.h)
|
|
509
493
|
// ============================================================================
|
|
510
494
|
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
495
|
/* ============================================================
|
|
516
496
|
* Model configuration
|
|
517
497
|
* ============================================================ */
|
|
@@ -886,6 +866,7 @@ typedef struct {
|
|
|
886
866
|
int n_threads;
|
|
887
867
|
float rep_penalty; /* repetition penalty (default: 1.1, 1.0 = disabled) */
|
|
888
868
|
int rep_window; /* how many recent tokens to penalize (default: 32) */
|
|
869
|
+
unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
|
|
889
870
|
/* Callback for streaming output */
|
|
890
871
|
void (*on_token)(const char* text, void* user_data);
|
|
891
872
|
void* user_data;
|
|
@@ -1123,9 +1104,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
|
|
|
1123
1104
|
/* Max threads supported by thread pool */
|
|
1124
1105
|
#define TQ_TP_MAX 16
|
|
1125
1106
|
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
1107
|
// ============================================================================
|
|
1130
1108
|
// Section 3: GGUF Types (from tq_gguf.h)
|
|
1131
1109
|
// ============================================================================
|
|
@@ -1143,10 +1121,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
|
|
|
1143
1121
|
* directly into TurboQuant inference engine.
|
|
1144
1122
|
*/
|
|
1145
1123
|
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
1124
|
/* ============================================================
|
|
1151
1125
|
* GGUF format constants
|
|
1152
1126
|
* ============================================================ */
|
|
@@ -1462,14 +1436,10 @@ int tq_metal_moe_forward(
|
|
|
1462
1436
|
const int* up_types, /* per-expert up quant types, NULL = use weight_type */
|
|
1463
1437
|
const int* down_types); /* per-expert down quant types, NULL = use weight_type */
|
|
1464
1438
|
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
1439
|
// ============================================================================
|
|
1469
1440
|
// Section 4: Internal API (from turboquant.h)
|
|
1470
1441
|
// ============================================================================
|
|
1471
1442
|
|
|
1472
|
-
|
|
1473
1443
|
/**
|
|
1474
1444
|
* TurboQuant.cpp — Cross-platform KV cache compression library
|
|
1475
1445
|
*
|
|
@@ -1477,9 +1447,6 @@ int tq_metal_moe_forward(
|
|
|
1477
1447
|
* Zero external dependencies (libc/libm only).
|
|
1478
1448
|
*/
|
|
1479
1449
|
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
1450
|
/* ============================================================
|
|
1484
1451
|
* Version
|
|
1485
1452
|
* ============================================================ */
|
|
@@ -1753,15 +1720,10 @@ void tq_progressive_free(tq_progressive_t* p);
|
|
|
1753
1720
|
|
|
1754
1721
|
tq_progressive_config_t tq_progressive_default_config(void);
|
|
1755
1722
|
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
1723
|
// ============================================================================
|
|
1761
1724
|
// Section 5: quant_ctx struct definition
|
|
1762
1725
|
// ============================================================================
|
|
1763
1726
|
|
|
1764
|
-
|
|
1765
1727
|
struct quant_ctx {
|
|
1766
1728
|
tq_model_t* model;
|
|
1767
1729
|
tq_state_t* state;
|
|
@@ -1788,7 +1750,6 @@ struct quant_ctx {
|
|
|
1788
1750
|
* - Random signs decorrelate channels across different blocks
|
|
1789
1751
|
*/
|
|
1790
1752
|
|
|
1791
|
-
|
|
1792
1753
|
#ifdef __ARM_NEON
|
|
1793
1754
|
#include <arm_neon.h>
|
|
1794
1755
|
#endif
|
|
@@ -1902,7 +1863,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
|
|
|
1902
1863
|
*/
|
|
1903
1864
|
/* Generic reference — no compiler-specific pragmas */
|
|
1904
1865
|
|
|
1905
|
-
|
|
1906
1866
|
/* ---------- FP16 helpers ---------- */
|
|
1907
1867
|
|
|
1908
1868
|
static uint16_t uni_fp32_to_fp16(float v) {
|
|
@@ -2285,7 +2245,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
|
|
|
2285
2245
|
// Section 8: Type Traits (from tq_traits.c)
|
|
2286
2246
|
// ============================================================================
|
|
2287
2247
|
|
|
2288
|
-
|
|
2289
2248
|
/* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
|
|
2290
2249
|
static void tq_stub_quantize(const float* src, void* dst, int n) {
|
|
2291
2250
|
(void)src; (void)dst; (void)n;
|
|
@@ -2583,7 +2542,6 @@ tq_type tq_type_from_name(const char* name) {
|
|
|
2583
2542
|
* No external dependencies — libc/libm only.
|
|
2584
2543
|
*/
|
|
2585
2544
|
|
|
2586
|
-
|
|
2587
2545
|
#ifdef __ARM_NEON
|
|
2588
2546
|
#include <arm_neon.h>
|
|
2589
2547
|
#endif
|
|
@@ -2617,7 +2575,6 @@ static struct {
|
|
|
2617
2575
|
|
|
2618
2576
|
static int g_n_threads = 1;
|
|
2619
2577
|
|
|
2620
|
-
|
|
2621
2578
|
static void* tp_worker(void* arg) {
|
|
2622
2579
|
int id = (int)(intptr_t)arg;
|
|
2623
2580
|
int my_gen = 0;
|
|
@@ -4173,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
|
|
|
4173
4130
|
config.n_threads = 1;
|
|
4174
4131
|
config.rep_penalty = 1.1f;
|
|
4175
4132
|
config.rep_window = 32;
|
|
4133
|
+
config.rng_seed = 42ULL;
|
|
4176
4134
|
config.on_token = NULL;
|
|
4177
4135
|
config.user_data = NULL;
|
|
4178
4136
|
return config;
|
|
@@ -4388,8 +4346,6 @@ void tq_matmul_1bit(float* out, const float* x,
|
|
|
4388
4346
|
* SPDX-License-Identifier: MIT
|
|
4389
4347
|
*/
|
|
4390
4348
|
|
|
4391
|
-
|
|
4392
|
-
|
|
4393
4349
|
#ifdef _WIN32
|
|
4394
4350
|
#else
|
|
4395
4351
|
#endif
|
|
@@ -5098,8 +5054,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
|
|
|
5098
5054
|
* Pure C11, no external dependencies.
|
|
5099
5055
|
*/
|
|
5100
5056
|
|
|
5101
|
-
|
|
5102
|
-
|
|
5103
5057
|
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
|
|
5104
5058
|
#include <arm_neon.h>
|
|
5105
5059
|
#define TQ_HAS_NEON 1
|
|
@@ -7174,7 +7128,6 @@ void tq_metal_batch_end_if_available(void) {
|
|
|
7174
7128
|
* Also supports the legacy llama2.c binary tokenizer format as fallback.
|
|
7175
7129
|
*/
|
|
7176
7130
|
|
|
7177
|
-
|
|
7178
7131
|
/* Global for qsort comparator (vocab index sorting) */
|
|
7179
7132
|
static char** g_vocab_for_sort;
|
|
7180
7133
|
static int cmp_vocab_idx(const void* a, const void* b) {
|
|
@@ -8519,7 +8472,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
|
|
|
8519
8472
|
* Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
|
|
8520
8473
|
*/
|
|
8521
8474
|
|
|
8522
|
-
|
|
8523
8475
|
#ifdef _WIN32
|
|
8524
8476
|
#else
|
|
8525
8477
|
#endif
|
|
@@ -12179,8 +12131,13 @@ tq_model_t* tq_load_gguf(const char* path) {
|
|
|
12179
12131
|
}
|
|
12180
12132
|
|
|
12181
12133
|
const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
|
|
12182
|
-
/* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality
|
|
12134
|
+
/* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
|
|
12135
|
+
* Can be set via environment variable or compile-time define (useful for WASM). */
|
|
12136
|
+
#ifdef TQ_NO_Q4
|
|
12137
|
+
if (1) {
|
|
12138
|
+
#else
|
|
12183
12139
|
if (getenv("TQ_NO_Q4")) {
|
|
12140
|
+
#endif
|
|
12184
12141
|
fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
|
|
12185
12142
|
goto skip_q4_conversion;
|
|
12186
12143
|
}
|
|
@@ -12929,7 +12886,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
|
|
|
12929
12886
|
* -> residual add
|
|
12930
12887
|
*/
|
|
12931
12888
|
|
|
12932
|
-
|
|
12933
12889
|
/* Unified Q2/1-bit matmul dispatch.
|
|
12934
12890
|
* When model->use_1bit_weights, Q2 fields contain sign bits + norms,
|
|
12935
12891
|
* dispatched to tq_matmul_1bit (FP32 input required).
|
|
@@ -15189,7 +15145,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
|
|
|
15189
15145
|
}
|
|
15190
15146
|
}
|
|
15191
15147
|
|
|
15192
|
-
|
|
15193
15148
|
/* Increment profile token count if profiling is active */
|
|
15194
15149
|
if (s->profile_kv) {
|
|
15195
15150
|
s->profile_kv_count++;
|
|
@@ -15240,7 +15195,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
|
|
|
15240
15195
|
* - Full generation loop with streaming callback
|
|
15241
15196
|
*/
|
|
15242
15197
|
|
|
15243
|
-
|
|
15244
15198
|
/* ============================================================
|
|
15245
15199
|
* Argmax sampling: return token with highest logit
|
|
15246
15200
|
* ============================================================ */
|
|
@@ -15461,7 +15415,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
|
|
|
15461
15415
|
fprintf(stderr, "\n");
|
|
15462
15416
|
}
|
|
15463
15417
|
|
|
15464
|
-
/* Prefill: process all prompt tokens
|
|
15418
|
+
/* Prefill: process all prompt tokens.
|
|
15419
|
+
* NOTE: No emscripten_sleep() here — the call stack during tq_forward()
|
|
15420
|
+
* is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
|
|
15421
|
+
* sleep here breaks ASYNCIFY for the entire generate call, including
|
|
15422
|
+
* the token streaming callback. The browser shows "Thinking..." via
|
|
15423
|
+
* requestAnimationFrame before entering this blocking prefill. */
|
|
15465
15424
|
for (int i = 0; i < n_prompt; i++) {
|
|
15466
15425
|
tq_forward(model, state, prompt_tokens[i], i);
|
|
15467
15426
|
}
|
|
@@ -15496,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
|
|
|
15496
15455
|
}
|
|
15497
15456
|
}
|
|
15498
15457
|
|
|
15499
|
-
/* Sample first generated token
|
|
15458
|
+
/* Sample first generated token. The seed is configurable via
|
|
15459
|
+
* config->rng_seed (default 42); 0 falls back to 42 so existing
|
|
15460
|
+
* callers that never set rng_seed get bit-identical behaviour. */
|
|
15500
15461
|
int pos = n_prompt;
|
|
15501
|
-
unsigned long long rng_state =
|
|
15462
|
+
unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
|
|
15502
15463
|
int next_token = tq_sample_topp(state->logits, vocab_size,
|
|
15503
15464
|
config->temperature, config->top_p,
|
|
15504
15465
|
&rng_state);
|
|
@@ -15663,7 +15624,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
|
|
|
15663
15624
|
return generated;
|
|
15664
15625
|
}
|
|
15665
15626
|
|
|
15666
|
-
|
|
15667
15627
|
// ============================================================================
|
|
15668
15628
|
|
|
15669
15629
|
// ============================================================================
|
|
@@ -15,7 +15,7 @@ try:
|
|
|
15
15
|
from importlib.metadata import version as _pkg_version
|
|
16
16
|
__version__ = _pkg_version("quantcpp")
|
|
17
17
|
except Exception:
|
|
18
|
-
__version__ = "0.
|
|
18
|
+
__version__ = "0.12.1" # fallback for editable / source-tree imports
|
|
19
19
|
|
|
20
20
|
import os
|
|
21
21
|
import sys
|
|
@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
|
|
|
202
202
|
// Section 1: Types and Specs (from tq_types.h, tq_spec.h)
|
|
203
203
|
// ============================================================================
|
|
204
204
|
|
|
205
|
-
|
|
206
|
-
|
|
207
205
|
/* Cross-language static assert: works in both C11 and C++11/17 */
|
|
208
206
|
#ifdef __cplusplus
|
|
209
207
|
#define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
|
|
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
|
|
|
219
217
|
#define TQ_PI_2 1.5707963267948966f
|
|
220
218
|
#endif
|
|
221
219
|
|
|
222
|
-
|
|
223
|
-
|
|
224
220
|
/* ============================================================
|
|
225
221
|
* Constants
|
|
226
222
|
* ============================================================ */
|
|
@@ -398,8 +394,6 @@ typedef struct {
|
|
|
398
394
|
int enable_recompression;/* Tier 1 → Tier 2 re-compression */
|
|
399
395
|
} tq_progressive_config_t;
|
|
400
396
|
|
|
401
|
-
|
|
402
|
-
|
|
403
397
|
/* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
|
|
404
398
|
* 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
|
|
405
399
|
* Block covers TQ_BK elements (128).
|
|
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
|
|
|
469
463
|
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
|
|
470
464
|
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
|
|
471
465
|
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
466
|
/* Format specification — version-aware, ONNX-inspired */
|
|
479
467
|
|
|
480
468
|
#define TQ_SPEC_VERSION 1
|
|
@@ -500,18 +488,10 @@ typedef struct {
|
|
|
500
488
|
uint8_t flags; /* TQ_FLAG_* bitmask */
|
|
501
489
|
} tq_format_spec_t;
|
|
502
490
|
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
491
|
// ============================================================================
|
|
508
492
|
// Section 2: Engine Types (from tq_engine.h)
|
|
509
493
|
// ============================================================================
|
|
510
494
|
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
495
|
/* ============================================================
|
|
516
496
|
* Model configuration
|
|
517
497
|
* ============================================================ */
|
|
@@ -886,6 +866,7 @@ typedef struct {
|
|
|
886
866
|
int n_threads;
|
|
887
867
|
float rep_penalty; /* repetition penalty (default: 1.1, 1.0 = disabled) */
|
|
888
868
|
int rep_window; /* how many recent tokens to penalize (default: 32) */
|
|
869
|
+
unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
|
|
889
870
|
/* Callback for streaming output */
|
|
890
871
|
void (*on_token)(const char* text, void* user_data);
|
|
891
872
|
void* user_data;
|
|
@@ -1123,9 +1104,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
|
|
|
1123
1104
|
/* Max threads supported by thread pool */
|
|
1124
1105
|
#define TQ_TP_MAX 16
|
|
1125
1106
|
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
1107
|
// ============================================================================
|
|
1130
1108
|
// Section 3: GGUF Types (from tq_gguf.h)
|
|
1131
1109
|
// ============================================================================
|
|
@@ -1143,10 +1121,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
|
|
|
1143
1121
|
* directly into TurboQuant inference engine.
|
|
1144
1122
|
*/
|
|
1145
1123
|
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
1124
|
/* ============================================================
|
|
1151
1125
|
* GGUF format constants
|
|
1152
1126
|
* ============================================================ */
|
|
@@ -1462,14 +1436,10 @@ int tq_metal_moe_forward(
|
|
|
1462
1436
|
const int* up_types, /* per-expert up quant types, NULL = use weight_type */
|
|
1463
1437
|
const int* down_types); /* per-expert down quant types, NULL = use weight_type */
|
|
1464
1438
|
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
1439
|
// ============================================================================
|
|
1469
1440
|
// Section 4: Internal API (from turboquant.h)
|
|
1470
1441
|
// ============================================================================
|
|
1471
1442
|
|
|
1472
|
-
|
|
1473
1443
|
/**
|
|
1474
1444
|
* TurboQuant.cpp — Cross-platform KV cache compression library
|
|
1475
1445
|
*
|
|
@@ -1477,9 +1447,6 @@ int tq_metal_moe_forward(
|
|
|
1477
1447
|
* Zero external dependencies (libc/libm only).
|
|
1478
1448
|
*/
|
|
1479
1449
|
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
1450
|
/* ============================================================
|
|
1484
1451
|
* Version
|
|
1485
1452
|
* ============================================================ */
|
|
@@ -1753,15 +1720,10 @@ void tq_progressive_free(tq_progressive_t* p);
|
|
|
1753
1720
|
|
|
1754
1721
|
tq_progressive_config_t tq_progressive_default_config(void);
|
|
1755
1722
|
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
1723
|
// ============================================================================
|
|
1761
1724
|
// Section 5: quant_ctx struct definition
|
|
1762
1725
|
// ============================================================================
|
|
1763
1726
|
|
|
1764
|
-
|
|
1765
1727
|
struct quant_ctx {
|
|
1766
1728
|
tq_model_t* model;
|
|
1767
1729
|
tq_state_t* state;
|
|
@@ -1788,7 +1750,6 @@ struct quant_ctx {
|
|
|
1788
1750
|
* - Random signs decorrelate channels across different blocks
|
|
1789
1751
|
*/
|
|
1790
1752
|
|
|
1791
|
-
|
|
1792
1753
|
#ifdef __ARM_NEON
|
|
1793
1754
|
#include <arm_neon.h>
|
|
1794
1755
|
#endif
|
|
@@ -1902,7 +1863,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
|
|
|
1902
1863
|
*/
|
|
1903
1864
|
/* Generic reference — no compiler-specific pragmas */
|
|
1904
1865
|
|
|
1905
|
-
|
|
1906
1866
|
/* ---------- FP16 helpers ---------- */
|
|
1907
1867
|
|
|
1908
1868
|
static uint16_t uni_fp32_to_fp16(float v) {
|
|
@@ -2285,7 +2245,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
|
|
|
2285
2245
|
// Section 8: Type Traits (from tq_traits.c)
|
|
2286
2246
|
// ============================================================================
|
|
2287
2247
|
|
|
2288
|
-
|
|
2289
2248
|
/* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
|
|
2290
2249
|
static void tq_stub_quantize(const float* src, void* dst, int n) {
|
|
2291
2250
|
(void)src; (void)dst; (void)n;
|
|
@@ -2583,7 +2542,6 @@ tq_type tq_type_from_name(const char* name) {
|
|
|
2583
2542
|
* No external dependencies — libc/libm only.
|
|
2584
2543
|
*/
|
|
2585
2544
|
|
|
2586
|
-
|
|
2587
2545
|
#ifdef __ARM_NEON
|
|
2588
2546
|
#include <arm_neon.h>
|
|
2589
2547
|
#endif
|
|
@@ -2617,7 +2575,6 @@ static struct {
|
|
|
2617
2575
|
|
|
2618
2576
|
static int g_n_threads = 1;
|
|
2619
2577
|
|
|
2620
|
-
|
|
2621
2578
|
static void* tp_worker(void* arg) {
|
|
2622
2579
|
int id = (int)(intptr_t)arg;
|
|
2623
2580
|
int my_gen = 0;
|
|
@@ -4173,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
|
|
|
4173
4130
|
config.n_threads = 1;
|
|
4174
4131
|
config.rep_penalty = 1.1f;
|
|
4175
4132
|
config.rep_window = 32;
|
|
4133
|
+
config.rng_seed = 42ULL;
|
|
4176
4134
|
config.on_token = NULL;
|
|
4177
4135
|
config.user_data = NULL;
|
|
4178
4136
|
return config;
|
|
@@ -4388,8 +4346,6 @@ void tq_matmul_1bit(float* out, const float* x,
|
|
|
4388
4346
|
* SPDX-License-Identifier: MIT
|
|
4389
4347
|
*/
|
|
4390
4348
|
|
|
4391
|
-
|
|
4392
|
-
|
|
4393
4349
|
#ifdef _WIN32
|
|
4394
4350
|
#else
|
|
4395
4351
|
#endif
|
|
@@ -5098,8 +5054,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
|
|
|
5098
5054
|
* Pure C11, no external dependencies.
|
|
5099
5055
|
*/
|
|
5100
5056
|
|
|
5101
|
-
|
|
5102
|
-
|
|
5103
5057
|
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
|
|
5104
5058
|
#include <arm_neon.h>
|
|
5105
5059
|
#define TQ_HAS_NEON 1
|
|
@@ -7174,7 +7128,6 @@ void tq_metal_batch_end_if_available(void) {
|
|
|
7174
7128
|
* Also supports the legacy llama2.c binary tokenizer format as fallback.
|
|
7175
7129
|
*/
|
|
7176
7130
|
|
|
7177
|
-
|
|
7178
7131
|
/* Global for qsort comparator (vocab index sorting) */
|
|
7179
7132
|
static char** g_vocab_for_sort;
|
|
7180
7133
|
static int cmp_vocab_idx(const void* a, const void* b) {
|
|
@@ -8519,7 +8472,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
|
|
|
8519
8472
|
* Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
|
|
8520
8473
|
*/
|
|
8521
8474
|
|
|
8522
|
-
|
|
8523
8475
|
#ifdef _WIN32
|
|
8524
8476
|
#else
|
|
8525
8477
|
#endif
|
|
@@ -12179,8 +12131,13 @@ tq_model_t* tq_load_gguf(const char* path) {
|
|
|
12179
12131
|
}
|
|
12180
12132
|
|
|
12181
12133
|
const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
|
|
12182
|
-
/* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality
|
|
12134
|
+
/* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
|
|
12135
|
+
* Can be set via environment variable or compile-time define (useful for WASM). */
|
|
12136
|
+
#ifdef TQ_NO_Q4
|
|
12137
|
+
if (1) {
|
|
12138
|
+
#else
|
|
12183
12139
|
if (getenv("TQ_NO_Q4")) {
|
|
12140
|
+
#endif
|
|
12184
12141
|
fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
|
|
12185
12142
|
goto skip_q4_conversion;
|
|
12186
12143
|
}
|
|
@@ -12929,7 +12886,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
|
|
|
12929
12886
|
* -> residual add
|
|
12930
12887
|
*/
|
|
12931
12888
|
|
|
12932
|
-
|
|
12933
12889
|
/* Unified Q2/1-bit matmul dispatch.
|
|
12934
12890
|
* When model->use_1bit_weights, Q2 fields contain sign bits + norms,
|
|
12935
12891
|
* dispatched to tq_matmul_1bit (FP32 input required).
|
|
@@ -15189,7 +15145,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
|
|
|
15189
15145
|
}
|
|
15190
15146
|
}
|
|
15191
15147
|
|
|
15192
|
-
|
|
15193
15148
|
/* Increment profile token count if profiling is active */
|
|
15194
15149
|
if (s->profile_kv) {
|
|
15195
15150
|
s->profile_kv_count++;
|
|
@@ -15240,7 +15195,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
|
|
|
15240
15195
|
* - Full generation loop with streaming callback
|
|
15241
15196
|
*/
|
|
15242
15197
|
|
|
15243
|
-
|
|
15244
15198
|
/* ============================================================
|
|
15245
15199
|
* Argmax sampling: return token with highest logit
|
|
15246
15200
|
* ============================================================ */
|
|
@@ -15461,7 +15415,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
|
|
|
15461
15415
|
fprintf(stderr, "\n");
|
|
15462
15416
|
}
|
|
15463
15417
|
|
|
15464
|
-
/* Prefill: process all prompt tokens
|
|
15418
|
+
/* Prefill: process all prompt tokens.
|
|
15419
|
+
* NOTE: No emscripten_sleep() here — the call stack during tq_forward()
|
|
15420
|
+
* is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
|
|
15421
|
+
* sleep here breaks ASYNCIFY for the entire generate call, including
|
|
15422
|
+
* the token streaming callback. The browser shows "Thinking..." via
|
|
15423
|
+
* requestAnimationFrame before entering this blocking prefill. */
|
|
15465
15424
|
for (int i = 0; i < n_prompt; i++) {
|
|
15466
15425
|
tq_forward(model, state, prompt_tokens[i], i);
|
|
15467
15426
|
}
|
|
@@ -15496,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
|
|
|
15496
15455
|
}
|
|
15497
15456
|
}
|
|
15498
15457
|
|
|
15499
|
-
/* Sample first generated token
|
|
15458
|
+
/* Sample first generated token. The seed is configurable via
|
|
15459
|
+
* config->rng_seed (default 42); 0 falls back to 42 so existing
|
|
15460
|
+
* callers that never set rng_seed get bit-identical behaviour. */
|
|
15500
15461
|
int pos = n_prompt;
|
|
15501
|
-
unsigned long long rng_state =
|
|
15462
|
+
unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
|
|
15502
15463
|
int next_token = tq_sample_topp(state->logits, vocab_size,
|
|
15503
15464
|
config->temperature, config->top_p,
|
|
15504
15465
|
&rng_state);
|
|
@@ -15663,7 +15624,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
|
|
|
15663
15624
|
return generated;
|
|
15664
15625
|
}
|
|
15665
15626
|
|
|
15666
|
-
|
|
15667
15627
|
// ============================================================================
|
|
15668
15628
|
|
|
15669
15629
|
// ============================================================================
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
"""
|
|
2
|
+
quantcpp CLI — chat with a local LLM in your terminal.
|
|
3
|
+
|
|
4
|
+
Ollama-style commands:
|
|
5
|
+
quantcpp pull MODEL Download a model from HuggingFace
|
|
6
|
+
quantcpp list List cached and available models
|
|
7
|
+
quantcpp run MODEL [Q] Chat with a model (auto-pulls if needed)
|
|
8
|
+
quantcpp serve MODEL Start OpenAI-compatible HTTP server
|
|
9
|
+
|
|
10
|
+
Backwards-compatible shortcut:
|
|
11
|
+
quantcpp Auto-downloads Llama-3.2-1B, starts chat
|
|
12
|
+
quantcpp "What is X?" One-shot question with default model
|
|
13
|
+
quantcpp --model NAME Use a specific model
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import sys
|
|
17
|
+
import os
|
|
18
|
+
import json
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Ollama-style short aliases → canonical _MODEL_REGISTRY keys
|
|
22
|
+
MODEL_ALIASES = {
|
|
23
|
+
"smollm2": "SmolLM2-135M",
|
|
24
|
+
"smollm2:135m": "SmolLM2-135M",
|
|
25
|
+
"qwen3.5": "Qwen3.5-0.8B",
|
|
26
|
+
"qwen3.5:0.8b": "Qwen3.5-0.8B",
|
|
27
|
+
"llama3.2": "Llama-3.2-1B",
|
|
28
|
+
"llama3.2:1b": "Llama-3.2-1B",
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _resolve_name(name):
|
|
33
|
+
"""Resolve user input to canonical registry key or local path."""
|
|
34
|
+
if name is None:
|
|
35
|
+
return None
|
|
36
|
+
if os.path.exists(name) and name.endswith(".gguf"):
|
|
37
|
+
return name
|
|
38
|
+
return MODEL_ALIASES.get(name.lower(), name)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _registry():
|
|
42
|
+
from quantcpp import _MODEL_REGISTRY, _CACHE_DIR
|
|
43
|
+
return _MODEL_REGISTRY, _CACHE_DIR
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def cmd_pull(args):
|
|
47
|
+
"""Download a model by alias or canonical name."""
|
|
48
|
+
import quantcpp
|
|
49
|
+
name = _resolve_name(args.model)
|
|
50
|
+
|
|
51
|
+
if os.path.exists(name) and name.endswith(".gguf"):
|
|
52
|
+
print(f"already local: {name}")
|
|
53
|
+
return 0
|
|
54
|
+
|
|
55
|
+
if name not in quantcpp._MODEL_REGISTRY:
|
|
56
|
+
avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
|
|
57
|
+
aliases = ", ".join(sorted(MODEL_ALIASES.keys()))
|
|
58
|
+
print(f"unknown model: {args.model!r}", file=sys.stderr)
|
|
59
|
+
print(f" registry: {avail}", file=sys.stderr)
|
|
60
|
+
print(f" aliases: {aliases}", file=sys.stderr)
|
|
61
|
+
return 1
|
|
62
|
+
|
|
63
|
+
print(f"pulling {name}...", file=sys.stderr)
|
|
64
|
+
try:
|
|
65
|
+
path = quantcpp.download(name)
|
|
66
|
+
size_mb = os.path.getsize(path) / (1024 * 1024)
|
|
67
|
+
print(f"\u2713 {name} \u2192 {path} ({size_mb:.0f} MB)", file=sys.stderr)
|
|
68
|
+
return 0
|
|
69
|
+
except Exception as e:
|
|
70
|
+
print(f"download failed: {e}", file=sys.stderr)
|
|
71
|
+
return 1
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def cmd_list(args):
|
|
75
|
+
"""List cached and available models."""
|
|
76
|
+
registry, cache_dir = _registry()
|
|
77
|
+
|
|
78
|
+
rows = []
|
|
79
|
+
for name, (repo, filename, approx_mb) in sorted(registry.items()):
|
|
80
|
+
path = cache_dir / filename
|
|
81
|
+
if path.exists():
|
|
82
|
+
size_mb = path.stat().st_size / (1024 * 1024)
|
|
83
|
+
status = "cached"
|
|
84
|
+
display_path = str(path)
|
|
85
|
+
else:
|
|
86
|
+
size_mb = approx_mb
|
|
87
|
+
status = "remote"
|
|
88
|
+
display_path = f"~{approx_mb} MB"
|
|
89
|
+
alias = next((a for a, n in MODEL_ALIASES.items() if n == name and ":" in a), "")
|
|
90
|
+
rows.append((status, name, alias, size_mb, display_path))
|
|
91
|
+
|
|
92
|
+
if args.json_output:
|
|
93
|
+
print(json.dumps([
|
|
94
|
+
{"status": s, "name": n, "alias": a, "size_mb": round(sz, 1), "path": p}
|
|
95
|
+
for (s, n, a, sz, p) in rows
|
|
96
|
+
], indent=2))
|
|
97
|
+
return 0
|
|
98
|
+
|
|
99
|
+
print(f"\n Models cache: {cache_dir}\n")
|
|
100
|
+
print(f" {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}")
|
|
101
|
+
print(f" {'-'*8} {'-'*16} {'-'*14} {'-'*8}")
|
|
102
|
+
for status, name, alias, size_mb, _ in rows:
|
|
103
|
+
size_str = f"{size_mb:.0f} MB"
|
|
104
|
+
print(f" {status:<8} {name:<16} {alias:<14} {size_str:>8}")
|
|
105
|
+
print()
|
|
106
|
+
return 0
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _resolve_to_path(name_or_path):
    """Resolve alias/name to a local .gguf path, downloading if needed."""
    import quantcpp

    resolved = _resolve_name(name_or_path)

    # A direct path to an existing GGUF file wins outright.
    if resolved.endswith(".gguf") and os.path.exists(resolved):
        return resolved

    registry = quantcpp._MODEL_REGISTRY
    if resolved not in registry:
        avail = ", ".join(sorted(registry.keys()))
        raise ValueError(
            f"unknown model: {name_or_path!r}. Available: {avail}"
        )

    _, filename, _ = registry[resolved]
    local_copy = quantcpp._CACHE_DIR / filename
    if local_copy.exists():
        return str(local_copy)

    print(f"model not cached \u2014 pulling {resolved}...", file=sys.stderr)
    return quantcpp.download(resolved)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def cmd_run(args):
    """Chat with a model (auto-pull if needed)."""
    try:
        gguf_path = _resolve_to_path(args.model)
    except ValueError as err:
        print(str(err), file=sys.stderr)
        return 1
    except Exception as err:
        print(f"pull failed: {err}", file=sys.stderr)
        return 1

    from quantcpp import Model

    print(f"loading {os.path.basename(gguf_path)}...", file=sys.stderr)
    model = Model(gguf_path, max_tokens=args.max_tokens, temperature=args.temperature,
                  n_threads=args.threads)

    def stream(text):
        # Print tokens as they arrive, then terminate the line.
        for piece in model.generate(text):
            print(piece, end="", flush=True)
        print()

    if args.prompt:
        # One-shot: the prompt may arrive as argv words (list) or a string.
        one_shot = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
        stream(one_shot)
    else:
        # Interactive REPL until Ctrl+C / EOF.
        print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
        try:
            while True:
                typed = input("\nYou: ")
                if not typed.strip():
                    continue
                print("AI: ", end="", flush=True)
                stream(typed)
        except (KeyboardInterrupt, EOFError):
            print("\nBye!", file=sys.stderr)

    model.close()
    return 0
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def cmd_serve(args):
    """Start OpenAI-compatible HTTP server (requires quant-server binary).

    Resolves the model (auto-pulling if missing), locates the quant-server
    executable, prints endpoint/usage hints to stderr, then replaces this
    process with the server via exec.  Returns 1 on resolve/exec failure,
    2 when no quant-server binary can be found.
    """
    import shutil  # removed unused `import subprocess` — only exec is used

    try:
        model_path = _resolve_to_path(args.model)
    except Exception as e:
        print(f"error: {e}", file=sys.stderr)
        return 1

    binary = shutil.which("quant-server")
    if not binary:
        # Look in common build dirs relative to repo
        for guess in ("./build/quant-server", "./build_metal/quant-server"):
            if os.path.isfile(guess) and os.access(guess, os.X_OK):
                binary = guess
                break

    if not binary:
        print("quant-server binary not found.", file=sys.stderr)
        print(" Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
              file=sys.stderr)
        print(" Or install via your package manager.", file=sys.stderr)
        return 2

    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
    print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
    print("", file=sys.stderr)
    print("OpenAI-compatible endpoints:", file=sys.stderr)
    print(f" POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr)
    print(f" GET http://localhost:{args.port}/v1/models", file=sys.stderr)
    print(f" GET http://localhost:{args.port}/health", file=sys.stderr)
    print("", file=sys.stderr)
    print("Streaming (SSE — token-by-token):", file=sys.stderr)
    print(f" curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
    print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
    print(' -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
          file=sys.stderr)
    print("", file=sys.stderr)
    print("Non-streaming (single JSON response):", file=sys.stderr)
    print(f" curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
    print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
    print(' -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
          file=sys.stderr)
    print("", file=sys.stderr)
    print("OpenAI Python SDK works as-is:", file=sys.stderr)
    print(f" client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
          file=sys.stderr)
    print(" client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
          file=sys.stderr)
    print("", file=sys.stderr)
    # exec replaces this Python process; on success the call never returns.
    # Fix: a failed exec (e.g. binary deleted between the check and here)
    # previously crashed with an unhandled OSError traceback.
    try:
        os.execvp(cmd[0], cmd)
    except OSError as e:
        print(f"failed to exec {cmd[0]}: {e}", file=sys.stderr)
        return 1
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def cmd_client(args):
    """Send a chat request to a running quantcpp serve endpoint.

    Default mode is streaming (SSE) — tokens print as they arrive.
    Use --no-stream for a single JSON response.

    Returns 0 on success, 1 when the server cannot be reached.
    """
    import json as _json
    # Fix: `urllib.error` was only reachable through the undocumented side
    # effect of importing urllib.request; import it explicitly.
    import urllib.error
    import urllib.request
    from urllib.parse import urlsplit

    url = args.url.rstrip("/") + "/v1/chat/completions"
    payload = {
        "model": args.model_name,
        "messages": [{"role": "user", "content": args.prompt}],
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "stream": not args.no_stream,
    }
    body = _json.dumps(payload).encode()
    req = urllib.request.Request(
        url, data=body,
        headers={
            "Content-Type": "application/json",
            "User-Agent": "quantcpp-client",
        },
    )

    try:
        with urllib.request.urlopen(req) as resp:
            if args.no_stream:
                data = _json.loads(resp.read())
                print(data["choices"][0]["message"]["content"])
                return 0

            # SSE stream — parse `data: {...}\n\n` chunks
            for line in resp:
                line = line.decode("utf-8", errors="replace").rstrip()
                if not line.startswith("data:"):
                    continue
                payload_str = line[5:].strip()
                if payload_str == "[DONE]":
                    break
                try:
                    chunk = _json.loads(payload_str)
                    delta = chunk["choices"][0]["delta"].get("content", "")
                    if delta:
                        print(delta, end="", flush=True)
                except Exception:
                    # Tolerate malformed / keep-alive chunks silently.
                    pass
            print()
            return 0
    except urllib.error.URLError as e:
        # Fix: derive the port with urlsplit instead of string splitting —
        # `url.rsplit(':', 1)` produced a garbled hint ("//localhost") for
        # URLs without an explicit port.
        port = urlsplit(args.url).port or 8080
        print(f"connection failed: {e}", file=sys.stderr)
        print(f" Is the server running on {args.url}?", file=sys.stderr)
        print(f" Start it with: quantcpp serve llama3.2:1b -p {port}",
              file=sys.stderr)
        return 1
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def cmd_chat_default(args):
    """Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
    # Fill in anything the legacy invocation may have left unset, then
    # delegate to the regular `run` code path.
    if not args.model:
        args.model = "Llama-3.2-1B"
    for attr, fallback in (("threads", 4), ("max_tokens", 256), ("temperature", 0.7)):
        setattr(args, attr, getattr(args, attr, fallback))
    if not args.prompt:
        args.prompt = None
    return cmd_run(args)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def main():
    """quantcpp CLI entry point: dispatch subcommands or legacy direct chat.

    Fix: argparse cannot combine optional subparsers with a free-form
    top-level positional on one parser — `quantcpp "What is gravity?"` was
    rejected as an invalid subcommand choice instead of reaching the
    backwards-compat chat mode.  We therefore inspect the first argv token
    and route non-command invocations through a dedicated legacy parser.
    """
    import argparse

    commands = ("pull", "list", "run", "serve", "client")
    argv = sys.argv[1:]

    # Legacy mode: no recognised subcommand (bare prompt, flags only, or
    # nothing at all) → backwards-compat parser, then default chat.
    if not argv or (argv[0] not in commands and argv[0] not in ("-h", "--help")):
        legacy = argparse.ArgumentParser(
            prog="quantcpp",
            description="Chat with a local LLM. No API key, no GPU, no server.",
        )
        legacy.add_argument("prompt", nargs="*", default=None,
                            help="(default mode) question to ask")
        legacy.add_argument("--model", "-m", default=None,
                            help="(default mode) model name or .gguf path")
        legacy.add_argument("--max-tokens", "-n", type=int, default=256)
        legacy.add_argument("--temperature", "-t", type=float, default=0.7)
        legacy.add_argument("--threads", "-j", type=int, default=4)
        args = legacy.parse_args(argv)
        args.command = None
        return cmd_chat_default(args)

    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
commands:
  pull MODEL           Download a model (e.g. llama3.2:1b)
  list                 List cached and available models
  run MODEL [PROMPT]   Chat with a model (auto-pulls if needed)
  serve MODEL          Start OpenAI-compatible HTTP server
  client PROMPT        Send a request to a running serve (default: SSE streaming)

examples:
  quantcpp pull llama3.2:1b
  quantcpp list
  quantcpp run llama3.2:1b
  quantcpp run llama3.2:1b "What is gravity?"
  quantcpp serve llama3.2:1b --port 8080
  quantcpp client "What is gravity?"              # streams from :8080
  quantcpp client "Hi" --url http://localhost:8081
  quantcpp client "Hi" --no-stream                # single JSON response

backwards-compat (no subcommand):
  quantcpp                            # default chat with Llama-3.2-1B
  quantcpp "What is gravity?"         # one-shot
  quantcpp --model SmolLM2-135M       # different model
""",
    )

    sub = parser.add_subparsers(dest="command")

    # pull
    p_pull = sub.add_parser("pull", help="Download a model from HuggingFace")
    p_pull.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")

    # list
    p_list = sub.add_parser("list", help="List cached and available models")
    p_list.add_argument("--json", dest="json_output", action="store_true")

    # run
    p_run = sub.add_parser("run", help="Chat with a model (auto-pulls if needed)")
    p_run.add_argument("model", help="Model name, alias, or .gguf path")
    p_run.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
    p_run.add_argument("-j", "--threads", type=int, default=4)
    p_run.add_argument("-n", "--max-tokens", type=int, default=256)
    p_run.add_argument("-t", "--temperature", type=float, default=0.7)

    # serve
    p_serve = sub.add_parser("serve", help="Start OpenAI-compatible HTTP server")
    p_serve.add_argument("model", help="Model name, alias, or .gguf path")
    p_serve.add_argument("-p", "--port", type=int, default=8080)
    p_serve.add_argument("-j", "--threads", type=int, default=4)

    # client
    p_client = sub.add_parser("client",
                              help="Send a chat request to a running quantcpp serve endpoint")
    p_client.add_argument("prompt", help="Question to send")
    p_client.add_argument("--url", default="http://localhost:8080",
                          help="Server URL (default: http://localhost:8080)")
    p_client.add_argument("--model-name", "-m", default="quantcpp",
                          help="Model name in the request body (server ignores)")
    p_client.add_argument("-n", "--max-tokens", type=int, default=256)
    p_client.add_argument("-t", "--temperature", type=float, default=0.7)
    p_client.add_argument("--no-stream", action="store_true",
                          help="Disable SSE streaming (single JSON response)")

    args = parser.parse_args(argv)

    handlers = {
        "pull": cmd_pull,
        "list": cmd_list,
        "run": cmd_run,
        "serve": cmd_serve,
        "client": cmd_client,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        return handler(args)

    # Defensive fallback: should be unreachable, since this branch only
    # runs when argv[0] is a known command.
    return cmd_chat_default(args)
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
# Allow direct execution (e.g. `python -m quantcpp.cli`); propagate the
# CLI's integer return code as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
quantcpp-0.11.0/quantcpp/cli.py
DELETED
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
quantcpp CLI — chat with a local LLM in your terminal.
|
|
3
|
-
|
|
4
|
-
Usage:
|
|
5
|
-
quantcpp # auto-downloads Llama-3.2-1B, starts chat
|
|
6
|
-
quantcpp "What is gravity?" # one-shot question
|
|
7
|
-
quantcpp --model SmolLM2-135M # use a smaller model (faster download)
|
|
8
|
-
quantcpp --model path/to/file.gguf # use your own GGUF file
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import sys
|
|
12
|
-
import os
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def main():
    """Legacy CLI: parse args, load (or download) a model, then chat.

    One-shot mode when a prompt is given on the command line; otherwise an
    interactive read-eval-print loop until Ctrl+C / EOF.
    """
    import argparse
    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
    )
    parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
    parser.add_argument("--model", "-m", default="Llama-3.2-1B",
                        help="Model name or path to .gguf file (default: Llama-3.2-1B)")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    args = parser.parse_args()

    # Imported lazily so `--help` stays fast and dependency-free.
    from quantcpp import Model

    # Load model: an existing file path is used directly; any other value is
    # treated as a model name and fetched via Model.from_pretrained.
    model_path = args.model
    if os.path.isfile(model_path):
        print(f"Loading {model_path}...", file=sys.stderr)
        m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
    else:
        print(f"Downloading {model_path}...", file=sys.stderr)
        m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
                                  temperature=args.temperature)

    # One-shot or interactive
    if args.prompt:
        # argparse collected the prompt as a list of words; rejoin them.
        question = " ".join(args.prompt)
        for tok in m.generate(question):
            print(tok, end="", flush=True)
        print()
    else:
        print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
        try:
            while True:
                question = input("\nYou: ")
                if not question.strip():
                    continue
                print("AI: ", end="", flush=True)
                # Stream tokens as they are generated.
                for tok in m.generate(question):
                    print(tok, end="", flush=True)
                print()
        except (KeyboardInterrupt, EOFError):
            # Ctrl+C / Ctrl+D ends the session cleanly.
            print("\nBye!", file=sys.stderr)

    m.close()
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
# Direct-execution entry point for the legacy CLI module.
if __name__ == "__main__":
    main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|